This data set contains 113,937 loans with 81 variables on each loan, including loan amount, borrower rate (or interest rate), current loan status, borrower income, and many others. The dataset contains so-called listings which either have been transformed to a loan or not. Partially funded loans are possible as well. My main overall interest might be why and who is becoming a so-called Prosper borrower and furthermore what is mainly influencing the interest rate. Interesting would be how the average Prosper rate is compared to the normal financial market.
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
from scipy.stats import norm
from scipy.stats import normaltest
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
%matplotlib inline
warnings.filterwarnings("always")
sb.set(style="whitegrid")
# switch off warnings
import warnings
warnings.filterwarnings("ignore")
# function library
########################################### metric_distri_qq(data, figsize, bin_size, tick_size)
# Plot the Distribution and the QQ Plot
## Input variables
#### data: the series which is analyzed
#### fig_size: e.g. [12,7]
#### bin_size: eg. 0.01 --> that's how the bins will be created
#### tick_size: e.g. 0.01 --> that's how the ticks on the x-axis will be created.
def metric_distri_qq(data, figsize, bin_size, tick_size):
    """Plot the value distribution with an overlaid theoretical normal curve
    (left) and a normal Q-Q plot (right) for a metric pandas Series.

    Parameters
    ----------
    data : pd.Series -- the series which is analyzed
    figsize : list   -- e.g. [12, 7]
    bin_size : float -- e.g. 0.01 --> that's how the bins will be created
    tick_size : float -- e.g. 0.01 --> spacing of the ticks on the x-axis
    """
    second_color = 'orange'
    # this is the normal (seaborn default) blue color
    base_color = (0.2980392156862745, 0.4470588235294118, 0.6901960784313725)
    # Preparation
    data_min = data.min()
    data_max = data.max()
    number_of_values = len(data)
    # quantile plotting positions (i + 0.5) / n for i = 0..n-1.
    # BUGFIX: the original used (np.arange(n) - 0.5) / n, whose first value is
    # negative, so norm.ppf returned NaN and the first Q-Q point was silently
    # dropped from the scatter.
    qs = (np.arange(number_of_values) + .5) / number_of_values
    # create the plot
    plt.figure(figsize=figsize)
    # left subplot: histogram of the data
    plt.subplot(1, 2, 1)
    bin_edges = np.arange(data_min - bin_size, data_max + bin_size, bin_size)
    plt.hist(data, bins=bin_edges, color=base_color);
    # place the x ticks
    plt.xticks(np.arange(0, data_max + tick_size, tick_size))
    # overlay a theoretical normal distribution on top
    samp_mean = data.mean()
    samp_sd = data.std()
    x = np.linspace(data_min, data_max, 1000)
    # normal distribution heights
    y = norm.pdf(x, samp_mean, samp_sd)
    # scale the curve so its area matches the histogram counts
    y *= number_of_values * bin_size
    # rotate tick labels for readability
    plt.xticks(rotation=50)
    # plot the normal distribution
    plt.plot(x, y, color=second_color);
    plt.xlabel(data.name)
    plt.ylabel('Count');
    plt.title('Obs. Value Distr. + Theoretical Normal Distr.')
    # right subplot: create a Q-Q plot
    plt.subplot(1, 2, 2)
    data_min_scaled = (data_min - samp_mean) / samp_sd
    data_max_scaled = (data_max - samp_mean) / samp_sd
    expected_scores = norm.ppf(qs)
    data_scores = (data.sort_values() - samp_mean) / samp_sd
    plt.scatter(expected_scores, data_scores, color=base_color)
    # reference line: a perfectly normal sample would lie on y = x
    plt.plot([data_min_scaled - 1, data_max_scaled + 1],
             [data_min_scaled - 1, data_max_scaled + 1], '--', color=second_color)
    plt.axis('equal')
    plt.xlabel('Expected Standard Scores')
    plt.ylabel('Obs. Standard Scores');
    plt.title('Quantile-Quantile Plot')
########################################### metric_distri_qq2(data, figsize, bin_size, start_adj)
# Plot the Distribution (log) and the QQ Plot
## Input variables
#### data: the series which is analyzed already log transformed
#### fig_size: e.g. [12,7]
#### bin_size: eg. 0.01 --> that's how the bins will be created
#### start_adj: value where the tick start point is adjusted
def metric_distri_qq2(data, figsize, bin_size, start_adj):
    """Plot the distribution of an already log-transformed Series with an
    overlaid theoretical normal curve (left) and a normal Q-Q plot (right).

    Parameters
    ----------
    data : pd.Series -- the series which is analyzed, already log transformed
    figsize : list   -- e.g. [12, 7]
    bin_size : float -- e.g. 0.01 --> that's how the bins will be created
    start_adj : float -- value by which the bin/tick start point is adjusted
    """
    second_color = 'orange'
    # normal (seaborn default) blue color
    base_color = (0.2980392156862745, 0.4470588235294118, 0.6901960784313725)
    # Preparation
    data_min = data.min()
    data_max = data.max()
    number_of_values = len(data)
    # quantile plotting positions (i + 0.5) / n for i = 0..n-1.
    # BUGFIX: the original used (np.arange(n) - 0.5) / n, whose first value is
    # negative, so norm.ppf returned NaN and the first Q-Q point was lost.
    qs = (np.arange(number_of_values) + .5) / number_of_values
    # create the plot
    plt.figure(figsize=figsize)
    # left subplot: histogram of the (log-transformed) data
    plt.subplot(1, 2, 1)
    log_bin_edges = np.arange(data_min + start_adj, data_max + bin_size, bin_size)
    # color added for consistency with metric_distri_qq
    plt.hist(data, bins=log_bin_edges, color=base_color)
    plt.xlabel(data.name + (' log10 Scale'))
    plt.ylabel('Count');
    plt.title('Obs. Log Transf. Value Distr. + Theoretical Normal Distr.')
    # overlay a theoretical normal distribution on top
    samp_mean = data.mean()
    samp_sd = data.std()
    x = np.linspace(data_min + start_adj, data_max, 200)
    # normal distribution heights
    y = norm.pdf(x, samp_mean, samp_sd)
    # scale the curve so its area matches the histogram counts
    y *= number_of_values * bin_size
    # rotate tick labels for readability
    plt.xticks(rotation=50)
    # plot the normal distribution
    plt.plot(x, y, color=second_color);
    # right subplot: create a Q-Q plot
    plt.subplot(1, 2, 2)
    data_min_scaled = (data_min - samp_mean) / samp_sd
    data_max_scaled = (data_max - samp_mean) / samp_sd
    expected_scores = norm.ppf(qs)
    data_scores = (data.sort_values() - samp_mean) / samp_sd
    plt.scatter(expected_scores, data_scores, color=base_color)
    # reference line: a perfectly normal sample would lie on y = x
    plt.plot([data_min_scaled - 1, data_max_scaled + 1],
             [data_min_scaled - 1, data_max_scaled + 1], '--', color=second_color)
    plt.axis('equal')
    plt.xlabel('Expected Standard Scores')
    plt.ylabel('Obs. Standard Scores');
    plt.title('Quantile-Quantile Plot')
########################## get_first_threshold_count(any array, threshold, returns the index of the array and the value)
def get_first_threshold_count(counts, threshold=1):
    """Return the (1-based position, value) of the first element in `counts`
    that exceeds `threshold`, or None if no element does.

    BUGFIX: the original body iterated over the undefined name `count`
    (NameError on every call) and had an unreachable `break` after `return`.
    The 1-based position matches the original `j` counter, which was
    incremented before the comparison.
    """
    for position, value in enumerate(counts, start=1):
        if value > threshold:
            return (position, value)
    return None
########################## metric_distri_log(data, figsize, bin_size_n, tick_size_n, bin_size_l, tick_locs, initial)
# Plot the Normal Distribution together with the log scale one
## Input variables
#### bin_size_n = 0.005
#### tick_size_n = 0.04
#### bin_size_l = 0.01,
#### tick_locs e.g. [0.05, 0.08, 0.12, 0.16, 0.2 , 0.24, 0.28, 0.32, 0.36, 0.4 , 0.44, 0.48]
#### initial: True --> prints the bin edges so one can derive the typical 1, 3, 10 log scale based on tick_locs
def metric_distri_log(data, figsize, bin_size_n, tick_size_n, bin_size_l, tick_locs, initial):
    """Plot the distribution on its original scale (left) and on a log10
    x-scale (right).

    Parameters
    ----------
    data : pd.Series -- the series which is analyzed (positive values,
                        since log10 is applied for the right subplot)
    figsize : list   -- e.g. [12, 7]
    bin_size_n : float -- bin width for the natural-scale histogram, e.g. 0.005
    tick_size_n : float -- tick spacing for the natural-scale histogram, e.g. 0.04
    bin_size_l : float -- bin width in log10 units for the log-scale histogram, e.g. 0.01
    tick_locs : list -- tick positions for the log-scale plot,
                        e.g. [0.05, 0.08, 0.12, 0.16, 0.2, ...]
    initial : bool   -- if True, print the natural-scale bin edges so a
                        typical 1-3-10 log tick layout can be derived
    """
    # this is the normal (seaborn default) blue color
    base_color = (0.2980392156862745, 0.4470588235294118, 0.6901960784313725)
    # preparation (data_min was computed but never used in the original)
    data_max = data.max()
    # left histogram: data plotted in natural units
    plt.figure(figsize=figsize)
    plt.subplot(1, 2, 1)
    bin_edges = np.arange(0, data_max + bin_size_n, bin_size_n)
    plt.hist(data, bins=bin_edges, color=base_color)
    plt.xticks(np.arange(0, data_max + tick_size_n, tick_size_n))
    plt.xlabel(data.name + ' values')
    plt.xticks(rotation=50)
    plt.title('Obs. Values - Original Scale')
    # right histogram: same data with logarithmically spaced bin edges
    plt.subplot(1, 2, 2)
    log_data = np.log10(data)
    log_bin_edges = 10 ** np.arange(log_data.min(), log_data.max() + bin_size_l, bin_size_l)
    # color added for consistency with the left subplot
    plt.hist(data, bins=log_bin_edges, color=base_color);
    plt.xscale('log')
    plt.xticks(rotation=50)
    plt.xlabel(data.name + ' log(values)')
    plt.title('Obs. Values - log10 Scale');
    plt.xticks(tick_locs, tick_locs);
    plt.xlim(left=tick_locs[0])
    # idiomatic truth test instead of `initial == True`
    if initial:
        print(bin_edges)
#categorial_2d(data, x, y, percent=True, zoom_x=0, zoom_y=0, fig_size=[12,5], sort=True, annotate=True )
# Plot a heatmap for 2 categorial attributes
## importing:
#### data=df : the data frame
#### x='Occupation' : the x attribute
#### y='BorrowerState': the y attribute
#### zoom_x=20 : zoom in the first x
#### zoom_y=20 : zoom in the first y
#### fig_size=[30,15] : picture size
#### percent=True : plot %
#### annotate=True : write the counts or %
def categorial_2d(data, x, y, percent=True, zoom_x=0, zoom_y=0, fig_size=[12,5], sort=True, annotate=True):
    """Plot a heatmap for two categorical attributes of `data`.

    Parameters
    ----------
    data : pd.DataFrame -- the data frame
    x : str             -- the x attribute, e.g. 'Occupation'
    y : str             -- the y attribute, e.g. 'BorrowerState'
    percent : bool      -- if True, plot percentages of the whole frame
    zoom_x, zoom_y : int -- if both nonzero, show only the first zoom_x /
                            zoom_y most frequent categories
    fig_size : list     -- figure size, e.g. [30, 15]
    sort : bool         -- sort rows/columns by frequency
    annotate : bool     -- write the counts (or %) into the cells
    """
    print(x, y)
    # get the total counts per (x, y) combination
    ct_counts = data.groupby([x, y]).size()
    # reset the index from groupby
    ct_counts = ct_counts.reset_index(name='count')
    # pivot: y values become the row index, x values the columns
    ct_counts = ct_counts.pivot(index=y, columns=x, values='count')
    if sort:
        # sort rows/columns by frequency so the dense values concentrate
        # in the upper left corner
        ct_counts = ct_counts.reindex(index=data[y].value_counts().index,
                                      columns=data[x].value_counts().index)
    # replace NAs with 0
    ct_counts.fillna(0, inplace=True)
    # BUGFIX: the original test `zoom_x & zoom_y != 0` parses as
    # `(zoom_x & zoom_y) != 0` (bitwise AND binds tighter than !=), which is
    # False for any nonzero pair sharing no set bits (e.g. 4 and 3).
    if zoom_x != 0 and zoom_y != 0:
        ct_counts = ct_counts.iloc[0:zoom_y, 0:zoom_x]
        # BUGFIX: use the passed-in frame `data`, not the global `df`
        yticklabels = data[y].value_counts().index[0:zoom_y]
        xticklabels = data[x].value_counts().index[0:zoom_x]
    else:
        yticklabels = data[y].value_counts().index
        xticklabels = data[x].value_counts().index
    # plot
    if percent:
        # BUGFIX: denominator taken from `data`, not the global `df`
        total_number = len(data)
        print('Denominator: {}'.format(total_number))
        # transform to %
        ct_counts = ct_counts / total_number * 100
        plt.figure(figsize=fig_size)
        sb.heatmap(ct_counts, cmap='viridis_r',
                   annot=annotate, fmt='.1f',
                   yticklabels=yticklabels,
                   xticklabels=xticklabels,
                   cbar_kws={'format': '%.2f%%'});
    else:
        plt.figure(figsize=fig_size)
        sb.heatmap(ct_counts, cmap='viridis_r',
                   annot=annotate, fmt='.0f',
                   yticklabels=yticklabels,
                   xticklabels=xticklabels)
Some Information about This Dataset can be found in my Google Drive link below. (https://drive.google.com/file/d/1SnL9_i9fbUx2M2MYTxwZ5jyLbOeLvQAD/view?usp=sharing). <------
Bid Object: A Bid is created when a Lender wishes to lend money to a Borrower in response to a Listing the Borrower created to solicit Bids. Bids are created by specifying an Amount and a Minimum Rate in which the Lender wishes to receive should the Bid win the auction and become a Loan. The Minimum Rate remains private unless the Bid is Outbid by other Bids offering a lower Minimum Rate.
Category Object: A Category is collection of Groups which share a common interest or affiliation. Categories are created by the Prosper Team. Group Leaders can associate their Group with one or more categories as they relate to their group.
CreditProfile Object: A CreditProfile is a timestamped set of extended credit information for a Member. Row level display and publication of CreditProfile is explicitly forbidden.
Group Object: A Group is a collection of Members who share a common interest or affiliation. Groups are managed by Group Leaders who bring borrowers to Prosper, maintain the group's presence on the site, and collect and/or share Group Rewards. Borrowers who are members of a group often get better interest rates because Lenders tend to have more confidence in Borrowers that belong to trusted Groups.
Listing Object: A Listing is created by a Borrower to solicit bids by describing themselves and the reason they are looking to borrow money. If the Listing receives enough bids by Lenders to reach the Amount Requested, then after the Listing period ends it will become a Loan. A Borrower may only have one active listing at a particular moment in time.
Loan Object: A Loan is created when a Borrower has received enough Bids to meet the full amount of money that the Borrower requested in their Listing. The Borrower must then make payments on the Loan to keep its status current.
Loan Performance Object: A LoanPerformance is an event in a Loan History that causes a change in loan value. This table can be used to calculate roll rates. Row level display and publication of LoanPerformance is explicitly forbidden.
Marketplace Object: The Marketplace is a collection of metrics and statistics about the Prosper Marketplace. These metrics are calculated daily. Historical metrics are provided as well.
Member Object: A Member is a registered user of the Prosper Marketplace site. A Member may have one or multiple roles which determines which actions the Member is allowed to perform on the site.
Based on my high level questions I think these are the main attributes:
I examined the structure of the dataset utilizing the mentioned sources and categorized 3 main areas with the following attributes. The main attributes are referenced as bold below. I assume the other attributes are helping to explain variations and patterns observed in the data. However, they might be not taken into consideration, depending on the analysis.
# load the csv
df = pd.read_csv('prosperLoanData.csv')
# keep an untouched copy for reference
df_copy = df.copy()
# harmonize and shrink the dataset based on the above mentioned rationales
# read the selected attributes
attr_2_be_del = pd.read_csv('Attributes_Not_Selected.csv')
# drop all not-needed columns in one call instead of one drop per column
df.drop(columns=list(attr_2_be_del.Attributes), inplace=True)
# clean the column names in one vectorized pass instead of three rename loops:
# remove blanks, turn '(' into '_' and drop ')'
df.columns = (df.columns
              .str.replace(' ', '', regex=False)
              .str.replace('(', '_', regex=False)
              .str.replace(')', '', regex=False))
# the original rename(index=str, ...) calls also converted the row index to
# strings as a side effect; preserved here so downstream .loc lookups behave
# identically -- TODO confirm the string index is actually wanted
df.index = df.index.map(str)
# Get an overview on the remaining columns
# set the base color
base_color = sb.color_palette()[0]
# horizontal bar chart of the absolute number of missing values per column
null_counts = df.isnull().sum()
null_counts = null_counts[null_counts > 0]
plt.figure(figsize=(8, 10))
sb.barplot(x=null_counts, y=np.arange(len(null_counts)), orient='h', color=base_color)
plt.xlabel('Absolute number of missing values')
plt.xticks(np.arange(0, 130000, 10000), rotation=20)
plt.yticks(np.arange(len(null_counts)), null_counts.index, rotation='horizontal');
# annotate every bar with its share of the full frame in percent
denominator = len(df)
for pos, count in enumerate(null_counts):
    share = count / denominator
    if share > 0:
        plt.text(count + 500, pos, '{:0.1f}%'.format(100 * share), va='center')
# call out one tiny bar that is barely visible
plt.text(20000, 1, '<--- 25 NAs in total!', va='center', color='r');
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 113937 entries, 0 to 113936 Data columns (total 53 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingNumber 113937 non-null int64 1 ListingCreationDate 113937 non-null object 2 Term 113937 non-null int64 3 LoanStatus 113937 non-null object 4 ClosedDate 55089 non-null object 5 BorrowerAPR 113912 non-null float64 6 BorrowerRate 113937 non-null float64 7 LenderYield 113937 non-null float64 8 EstimatedEffectiveYield 84853 non-null float64 9 EstimatedLoss 84853 non-null float64 10 EstimatedReturn 84853 non-null float64 11 ProsperRating_numeric 84853 non-null float64 12 ProsperRating_Alpha 84853 non-null object 13 ProsperScore 84853 non-null float64 14 ListingCategory_numeric 113937 non-null int64 15 BorrowerState 108422 non-null object 16 Occupation 110349 non-null object 17 EmploymentStatus 111682 non-null object 18 EmploymentStatusDuration 106312 non-null float64 19 IsBorrowerHomeowner 113937 non-null bool 20 CreditScoreRangeLower 113346 non-null float64 21 CreditScoreRangeUpper 113346 non-null float64 22 CurrentCreditLines 106333 non-null float64 23 OpenCreditLines 106333 non-null float64 24 TotalCreditLinespast7years 113240 non-null float64 25 InquiriesLast6Months 113240 non-null float64 26 CurrentDelinquencies 113240 non-null float64 27 AmountDelinquent 106315 non-null float64 28 DelinquenciesLast7Years 112947 non-null float64 29 RevolvingCreditBalance 106333 non-null float64 30 BankcardUtilization 106333 non-null float64 31 AvailableBankcardCredit 106393 non-null float64 32 DebtToIncomeRatio 105383 non-null float64 33 IncomeRange 113937 non-null object 34 IncomeVerifiable 113937 non-null bool 35 StatedMonthlyIncome 113937 non-null float64 36 TotalProsperLoans 22085 non-null float64 37 TotalProsperPaymentsBilled 22085 non-null float64 38 OnTimeProsperPayments 22085 non-null float64 39 ProsperPaymentsLessThanOneMonthLate 22085 non-null float64 40 ProsperPaymentsOneMonthPlusLate 22085 
non-null float64 41 ProsperPrincipalBorrowed 22085 non-null float64 42 ProsperPrincipalOutstanding 22085 non-null float64 43 ScorexChangeAtTimeOfListing 18928 non-null float64 44 LoanOriginalAmount 113937 non-null int64 45 LoanOriginationDate 113937 non-null object 46 MemberKey 113937 non-null object 47 MonthlyLoanPayment 113937 non-null float64 48 PercentFunded 113937 non-null float64 49 Recommendations 113937 non-null int64 50 InvestmentFromFriendsCount 113937 non-null int64 51 InvestmentFromFriendsAmount 113937 non-null float64 52 Investors 113937 non-null int64 dtypes: bool(2), float64(34), int64(7), object(10) memory usage: 45.4+ MB
Comment: Ok, let's comment and decide what to do?
There is a substantial amount missing for ClosedDate, which I believe is mainly due to the fact that the majority of loans are still running. Furthermore, the section at the bottom, introduced by TotalProsperLoans, TotalProsperPaymentsBilled, etc., consists of aggregations based on historical data; hence, for first-time borrowers those fields will not be filled.
Start with the Univariate Analysis ...
The Prosper Rating assigned at the time the listing was created between AA - HR. Applicable for loans originated after July 2009. 0 - N/A, 1 - HR, 2 - E, 3 - D, 4 - C, 5 - B, 6 - A, 7 - AA.
df.ProsperRating_numeric.value_counts()
4.0 18345 5.0 15581 6.0 14551 3.0 14274 2.0 9795 1.0 6935 7.0 5372 Name: ProsperRating_numeric, dtype: int64
df.ProsperRating_Alpha.value_counts()
C 18345 B 15581 A 14551 D 14274 E 9795 HR 6935 AA 5372 Name: ProsperRating_Alpha, dtype: int64
df.ProsperRating_Alpha.dtype
dtype('O')
df[['ProsperRating_Alpha', 'CreditScoreRangeLower', 'CreditScoreRangeUpper']].head(10)
| ProsperRating_Alpha | CreditScoreRangeLower | CreditScoreRangeUpper | |
|---|---|---|---|
| 0 | NaN | 640.0 | 659.0 |
| 1 | A | 680.0 | 699.0 |
| 2 | NaN | 480.0 | 499.0 |
| 3 | A | 800.0 | 819.0 |
| 4 | D | 680.0 | 699.0 |
| 5 | B | 740.0 | 759.0 |
| 6 | E | 680.0 | 699.0 |
| 7 | C | 700.0 | 719.0 |
| 8 | AA | 820.0 | 839.0 |
| 9 | AA | 820.0 | 839.0 |
Comment: As we have 25% not populated because the Prosper Rating started after July 2009 I thought maybe a simple rule (or even regression) for the derivation of the Prosper
rating based on the external one would be easy. But it is not as e.g. D is between 680 - 699 and E later down as well. So let's flag them as before_July09
and analyze them, deciding at the end whether to keep or to get rid of them.
df.ProsperRating_Alpha.fillna('before_July09', inplace=True)
df.ProsperRating_Alpha.value_counts()
before_July09 29084 C 18345 B 15581 A 14551 D 14274 E 9795 HR 6935 AA 5372 Name: ProsperRating_Alpha, dtype: int64
# ok it's string let's convert into a categorial var with the right order.
#survey_responses = pd.Categorical(responses, categories=["happy", "content", "sad"], ordered=True)
df.ProsperRating_Alpha = pd.Categorical(df.ProsperRating_Alpha, categories=["AA", "A", "B", "C", "D", "E", "HR", "before_July09"], ordered=True)
df.ProsperRating_Alpha.describe()
count 113937 unique 8 top before_July09 freq 29084 Name: ProsperRating_Alpha, dtype: object
df.ProsperRating_Alpha.sort_values().head(10)
92781 AA 52828 AA 29329 AA 52838 AA 96696 AA 29302 AA 12779 AA 82795 AA 29286 AA 96714 AA Name: ProsperRating_Alpha, dtype: category Categories (8, object): ['AA' < 'A' < 'B' < 'C' < 'D' < 'E' < 'HR' < 'before_July09']
# let's visualize
plt.figure(figsize = [12, 5])
sb.countplot(data = df, x = 'ProsperRating_Alpha',color=base_color)
plt.xticks(rotation=90);
Comment: For further analysis, we need to exclude the records "before_July09" (so let's remove them). The other rating counts are nicely centred around "C" which divides lower risks and higher risks.
# drop every record that predates the Prosper rating (flagged 'before_July09')
df = df[df.ProsperRating_Alpha != 'before_July09']
# let's visualize
# NOTE(review): this plot still shows an empty 'before_July09' slot because the
# category level is only removed from the dtype two lines further down
plt.figure(figsize = [12, 5])
sb.countplot(data = df, x = 'ProsperRating_Alpha',color=base_color)
plt.xticks(rotation=90);
# remove the now-unused category level from the categorical dtype
df['ProsperRating_Alpha'] = df['ProsperRating_Alpha'].cat.remove_categories(['before_July09'])
A custom risk score built using historical Prosper data. The score ranges from 1-10, with 10 being the best, or lowest risk score. Applicable for loans originated after July 2009.
df.ProsperScore.value_counts()
4.0 12595 6.0 12278 8.0 12053 7.0 10597 5.0 9813 3.0 7642 9.0 6911 2.0 5766 10.0 4750 11.0 1456 1.0 992 Name: ProsperScore, dtype: int64
print(df.ProsperScore.nunique(), df.ProsperScore.dtype)
11 float64
df.ProsperScore.isna().sum()
0
Comment: ok it's 11 levels as seen in the data and in that link.
# let's visualize
plt.figure(figsize = [12, 5])
sb.countplot(data = df, x = 'ProsperScore',color=base_color)
plt.xticks(rotation=90);
Comment: The other scores counts are somehow centred around "6" which divides lower risks and higher risks. Although the 8 and 4 are as well very dominant.
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 84853 entries, 1 to 113936 Data columns (total 53 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingNumber 84853 non-null int64 1 ListingCreationDate 84853 non-null object 2 Term 84853 non-null int64 3 LoanStatus 84853 non-null object 4 ClosedDate 26005 non-null object 5 BorrowerAPR 84853 non-null float64 6 BorrowerRate 84853 non-null float64 7 LenderYield 84853 non-null float64 8 EstimatedEffectiveYield 84853 non-null float64 9 EstimatedLoss 84853 non-null float64 10 EstimatedReturn 84853 non-null float64 11 ProsperRating_numeric 84853 non-null float64 12 ProsperRating_Alpha 84853 non-null category 13 ProsperScore 84853 non-null float64 14 ListingCategory_numeric 84853 non-null int64 15 BorrowerState 84853 non-null object 16 Occupation 83520 non-null object 17 EmploymentStatus 84853 non-null object 18 EmploymentStatusDuration 84834 non-null float64 19 IsBorrowerHomeowner 84853 non-null bool 20 CreditScoreRangeLower 84853 non-null float64 21 CreditScoreRangeUpper 84853 non-null float64 22 CurrentCreditLines 84853 non-null float64 23 OpenCreditLines 84853 non-null float64 24 TotalCreditLinespast7years 84853 non-null float64 25 InquiriesLast6Months 84853 non-null float64 26 CurrentDelinquencies 84853 non-null float64 27 AmountDelinquent 84853 non-null float64 28 DelinquenciesLast7Years 84853 non-null float64 29 RevolvingCreditBalance 84853 non-null float64 30 BankcardUtilization 84853 non-null float64 31 AvailableBankcardCredit 84853 non-null float64 32 DebtToIncomeRatio 77557 non-null float64 33 IncomeRange 84853 non-null object 34 IncomeVerifiable 84853 non-null bool 35 StatedMonthlyIncome 84853 non-null float64 36 TotalProsperLoans 19797 non-null float64 37 TotalProsperPaymentsBilled 19797 non-null float64 38 OnTimeProsperPayments 19797 non-null float64 39 ProsperPaymentsLessThanOneMonthLate 19797 non-null float64 40 ProsperPaymentsOneMonthPlusLate 19797 non-null float64 41 
ProsperPrincipalBorrowed 19797 non-null float64 42 ProsperPrincipalOutstanding 19797 non-null float64 43 ScorexChangeAtTimeOfListing 16642 non-null float64 44 LoanOriginalAmount 84853 non-null int64 45 LoanOriginationDate 84853 non-null object 46 MemberKey 84853 non-null object 47 MonthlyLoanPayment 84853 non-null float64 48 PercentFunded 84853 non-null float64 49 Recommendations 84853 non-null int64 50 InvestmentFromFriendsCount 84853 non-null int64 51 InvestmentFromFriendsAmount 84853 non-null float64 52 Investors 84853 non-null int64 dtypes: bool(2), category(1), float64(34), int64(7), object(9) memory usage: 35.8+ MB
The Occupation selected by the Borrower at the time they created the listing.
df.Occupation.value_counts()
Other 21317
Professional 10542
Executive 3468
Computer Programmer 3236
Teacher 2888
...
Judge 22
Student - College Freshman 17
Student - College Sophomore 16
Student - Community College 10
Student - Technical School 2
Name: Occupation, Length: 67, dtype: int64
Comment: Oh my god, seems to be free text entry, actually not that bad. 67 different professions, majority is in Others, btw the missing values (3.1%) we can move to others
# change Occupation NAs to the most frequent value, which is "Other"
# (assign back instead of inplace fillna on a column selection)
df['Occupation'] = df['Occupation'].fillna(df.Occupation.value_counts().index[0])
# verify that the column itself has no missing values left.
# BUGFIX: the original checked value_counts().isnull().sum(), which is 0 by
# construction (value_counts never contains NaN) and proved nothing.
df.Occupation.isnull().sum()
0
df['Occupation'] = df['Occupation'].astype('category')
df.Occupation.dtype
CategoricalDtype(categories=['Accountant/CPA', 'Administrative Assistant', 'Analyst',
'Architect', 'Attorney', 'Biologist', 'Bus Driver',
'Car Dealer', 'Chemist', 'Civil Service', 'Clergy',
'Clerical', 'Computer Programmer', 'Construction', 'Dentist',
'Doctor', 'Engineer - Chemical', 'Engineer - Electrical',
'Engineer - Mechanical', 'Executive', 'Fireman',
'Flight Attendant', 'Food Service',
'Food Service Management', 'Homemaker', 'Investor', 'Judge',
'Laborer', 'Landscaping', 'Medical Technician',
'Military Enlisted', 'Military Officer', 'Nurse (LPN)',
'Nurse (RN)', 'Nurse's Aide', 'Other', 'Pharmacist',
'Pilot - Private/Commercial',
'Police Officer/Correction Officer', 'Postal Service',
'Principal', 'Professional', 'Professor', 'Psychologist',
'Realtor', 'Religious', 'Retail Management',
'Sales - Commission', 'Sales - Retail', 'Scientist',
'Skilled Labor', 'Social Worker',
'Student - College Freshman',
'Student - College Graduate Student',
'Student - College Junior', 'Student - College Senior',
'Student - College Sophomore', 'Student - Community College',
'Student - Technical School', 'Teacher', 'Teacher's Aide',
'Tradesman - Carpenter', 'Tradesman - Electrician',
'Tradesman - Mechanic', 'Tradesman - Plumber',
'Truck Driver', 'Waiter/Waitress'],
ordered=False)
# let's do a bar plot
plt.figure(figsize = [20, 8])
base_color = sb.color_palette()[0]
sb.countplot(data = df, x = 'Occupation',color=base_color, order=df['Occupation'].value_counts().index)
plt.xticks(rotation=90);
# let's show the values after "Others"
plt.figure(figsize = [20, 8])
sb.countplot(data = df, x = 'Occupation',color=base_color,
order=df['Occupation'].value_counts().iloc[1:].index)
plt.xticks(rotation=90);
Comment: Leaving the occupations equal to "Other" out gives a good picture of the different professions; however, the most frequent values are "Other" and "Professional".
The employment status of the borrower at the time they posted the listing.
df.EmploymentStatus.value_counts()
Employed 67310 Full-time 7927 Self-employed 4538 Other 3806 Not employed 649 Retired 367 Part-time 256 Name: EmploymentStatus, dtype: int64
df.EmploymentStatus.isna().sum()
0
Comment: Change to categorical.
# change to categorial
df['EmploymentStatus'] = df['EmploymentStatus'].astype('category')
# let's do a bar plot
plt.figure(figsize = [10, 5])
base_color = sb.color_palette()[0]
sb.countplot(data = df, x = 'EmploymentStatus',color=base_color, order=df['EmploymentStatus'].value_counts().index)
plt.xticks(rotation=90);
Comment: Most values are labeled as "Employed". The 2nd highest, "Full-time", could just as well be "Employed". The same applies to "Part-time" — it is unclear how these really differ from "Employed".
A dimension reduction strategy here is to combine already categories: Employed + Full-time = Employed(full-time)
# consolidate 'Employed' and 'Full-time' into a single level
# 'Employed (full-time)': register the new level first
df['EmploymentStatus'] = df['EmploymentStatus'].cat.add_categories(['Employed (full-time)'])
# relabel every row carrying one of the two old levels in one pass
to_merge = df[df['EmploymentStatus'].isin(['Full-time', 'Employed'])].index
df.loc[to_merge, 'EmploymentStatus'] = 'Employed (full-time)'
# drop the two now-empty levels from the categorical dtype
df['EmploymentStatus'] = df['EmploymentStatus'].cat.remove_categories(['Employed', 'Full-time'])
# bar plot of the consolidated employment status
plt.figure(figsize = [10, 5])
base_color = sb.color_palette()[0]
sb.countplot(data = df, x = 'EmploymentStatus', color=base_color,
             order=df['EmploymentStatus'].value_counts().index)
plt.xticks(rotation=90);
print(df['EmploymentStatus'].cat.categories)
Index(['Not employed', 'Other', 'Part-time', 'Retired', 'Self-employed',
'Employed (full-time)'],
dtype='object')
Comment: Most lenders are "Employed (full-time) only a small proportion is "Self-employed". Even less is the proportion of "Part-time" workers.
A Borrower will be classified as a homeowner if they have a mortgage on their credit profile or provide documentation confirming they are a homeowner.
sorted_counts=df.IsBorrowerHomeowner.value_counts().sort_values(ascending = False)
sorted_counts
True 44848 False 40005 Name: IsBorrowerHomeowner, dtype: int64
# plot a piechart of the homeowner flag
# NOTE(review): the labels ['yes', 'no'] rely on sorted_counts being sorted
# descending with True first -- verify if the underlying data changes
plt.pie(sorted_counts, labels = ['yes', 'no'], startangle = 90, autopct='%1.1f%%',
counterclock = True, colors=['lightblue','skyblue']);
plt.axis('square')
plt.title('Is Borrower Homeowner?');
# replace the raw booleans with readable labels
df['IsBorrowerHomeowner'].replace({True: 'Yes', False: 'No'}, inplace=True)
# and turn the column into an ordered categorical (Yes < No)
df.IsBorrowerHomeowner = pd.Categorical(df.IsBorrowerHomeowner,
categories=['Yes', 'No'], ordered=True)
df.IsBorrowerHomeowner.value_counts()
Yes 44848 No 40005 Name: IsBorrowerHomeowner, dtype: int64
Comment: Nearly evenly distributed, this is nice and can be analyzed further. In Bivariate e.g. together with income etc...
The two letters abbreviation of the state of the address of the borrower at the time the Listing was created.
df.BorrowerState.value_counts(normalize=True)
CA 0.126819 NY 0.068872 TX 0.066397 FL 0.063710 IL 0.050252 OH 0.039751 GA 0.039445 VA 0.032751 NJ 0.032114 PA 0.031560 MI 0.030900 NC 0.028744 MD 0.026410 WA 0.025373 MA 0.021578 MO 0.021131 CO 0.020376 MN 0.020211 IN 0.019469 TN 0.018125 WI 0.017925 CT 0.017583 AZ 0.015992 OR 0.014307 AL 0.013989 NV 0.012068 SC 0.011714 KY 0.010465 KS 0.010064 LA 0.009911 AR 0.009063 OK 0.008627 MS 0.007955 NE 0.006541 UT 0.006152 NH 0.005221 RI 0.004820 ID 0.004749 HI 0.004030 NM 0.003901 DC 0.003866 WV 0.003653 DE 0.003147 MT 0.002605 SD 0.002227 VT 0.002015 AK 0.001968 WY 0.001450 Name: BorrowerState, dtype: float64
# Missing Values
df.BorrowerState.isnull().sum()
0
# originally I had the prosper_score (with the 25% missing values mot in the beginning)
# Let's distribute the NAs propotional to the States observed
# https://stackoverflow.com/questions/44867219/pandas-filling-na-values-to-be-filled-based-on-distribution-of-existing-values
null_counts = df.BorrowerState.value_counts(normalize=True)
#null_counts
# assign the missing values
#missing = df['BorrowerState'].isnull()
# take a sample from the missing states based on the proportion (probability) of it's counts and assign it to the
# missing values
#df.loc[missing,'BorrowerState'] = np.random.choice(null_counts.index, size=len(df[missing]),p=null_counts.values)
# change to categorical dtype (memory-efficient, enables .cat accessors)
df['BorrowerState'] = df['BorrowerState'].astype('category')
# confirm there are no missing states left after the row filtering above
print(df.BorrowerState.isnull().sum())
0
# let's show the values after "Others"
plt.figure(figsize = [20, 8])
sb.countplot(data = df, x = 'BorrowerState',color=base_color,
order=df['BorrowerState'].value_counts().index)
plt.xticks(rotation=90);
Comments: Ok, California is leading clearly that distribution, might be that all of those "Computer Programmers" are located in Silicon Valley.
The date the listing was created.
df.ListingCreationDate.describe()
count 84853 unique 83980 top 2013-10-02 17:20:16.550000000 freq 6 Name: ListingCreationDate, dtype: object
Comment: The data type is string; let's convert it to a date.
# convert the string column to a proper datetime
df['ListingCreationDate'] = pd.to_datetime(df['ListingCreationDate'])
# (removed two no-op lines: `.dt.week` and `.dt.year` were computed and
# immediately discarded; `.dt.week` is also deprecated in newer pandas)
# count listings per calendar year and plot the yearly development
yearly_dev = df.groupby(df['ListingCreationDate'].dt.year)['ListingCreationDate'].count().reset_index(name='Yearly_Count')
plt.errorbar(data=yearly_dev, x='ListingCreationDate', y='Yearly_Count')
years = np.arange(2009, 2015, 1)
plt.xticks(years, rotation=90);
df.ListingCreationDate.max()
Timestamp('2014-03-10 12:20:53.760000')
Comment: There is a clear upward over the years, as 2014 is only available till mid of march it is not really a drop.
The date the loan was originated.
df.LoanOriginationDate.describe()
count 84853 unique 1165 top 2014-01-22 00:00:00 freq 491 Name: LoanOriginationDate, dtype: object
# convert the string column to a proper datetime
df.LoanOriginationDate = pd.to_datetime(df['LoanOriginationDate'])
# count loan originations per calendar year
yearly_dev = df.groupby(df['LoanOriginationDate'].dt.year)['LoanOriginationDate'].count().reset_index(name = 'Yearly_Count')
# line plot of the yearly counts
plt.errorbar(data = yearly_dev, x = 'LoanOriginationDate', y = 'Yearly_Count')
years = np.arange(2009, 2015, 1)
plt.xticks(years,rotation=90);
Comment: Nearly the same as the ListingCreationDate.
The category of the listing that the borrower selected when posting their listing: 0 - Not Available, 1 - Debt Consolidation, 2 - Home Improvement, 3 - Business, 4 - Personal Loan, 5 - Student Use, 6 - Auto, 7- Other, 8 - Baby&Adoption, 9 - Boat, 10 - Cosmetic Procedure, 11 - Engagement Ring, 12 - Green Loans, 13 - Household Expenses, 14 - Large Purchases, 15 - Medical/Dental, 16 - Motorcycle, 17 - RV, 18 - Taxes, 19 - Vacation, 20 - Wedding Loans
df.ListingCategory_numeric.value_counts(normalize=True)
1 0.626731 7 0.108635 2 0.080150 3 0.062437 6 0.026363 13 0.023523 15 0.017937 18 0.010430 14 0.010324 20 0.009086 19 0.009051 16 0.003583 5 0.003229 11 0.002557 8 0.002345 10 0.001072 9 0.001002 12 0.000695 17 0.000613 0 0.000236 Name: ListingCategory_numeric, dtype: float64
# Transform the numeric listing category into a readable label.
# Index i of this lookup series corresponds to ListingCategory_numeric == i.
category_names = np.array(['Not Available', 'Debt Consolidation', 'Home Improvement', 'Business',
                           'Personal Loan', 'Student Use', 'Auto', 'Other', 'Baby&Adoption',
                           'Boat', 'Cosmetic Procedure', 'Engagement Ring', 'Green Loans', 'Household Expenses',
                           'Large Purchases', 'Medical/Dental', 'Motorcycle', 'RV', 'Taxes',
                           'Vacation', 'Wedding Loans'])
listing_category_alpha = pd.Series(category_names)

def trans_2_cat(x):
    """Return the human-readable category label for numeric listing category ``x``."""
    return listing_category_alpha[x]
## Derive the readable category column and store it as a pandas categorical.
df['ListingCategory_alpha'] = (df['ListingCategory_numeric']
                                 .apply(trans_2_cat)
                                 .astype('category'))
df.ListingCategory_alpha.value_counts(normalize=True)
Debt Consolidation 0.626731 Other 0.108635 Home Improvement 0.080150 Business 0.062437 Auto 0.026363 Household Expenses 0.023523 Medical/Dental 0.017937 Taxes 0.010430 Large Purchases 0.010324 Wedding Loans 0.009086 Vacation 0.009051 Motorcycle 0.003583 Student Use 0.003229 Engagement Ring 0.002557 Baby&Adoption 0.002345 Cosmetic Procedure 0.001072 Boat 0.001002 Green Loans 0.000695 RV 0.000613 Not Available 0.000236 Name: ListingCategory_alpha, dtype: float64
# Bar chart of listing categories, most frequent first.
plt.figure(figsize=[20, 8])
category_order = df['ListingCategory_alpha'].value_counts().index
sb.countplot(data=df, x='ListingCategory_alpha', color=base_color, order=category_order)
plt.xticks(rotation=90);
Comment: This is not a surprise; in the various articles I read about P2P lending, it is often mentioned that this type of loan is ideal for consolidating different loans, including credit card debt. Here we can see that more than half of all loans are used for that purpose. Also here we
merge "Not Available" into "Other".
# Fold the rare 'Not Available' listings into 'Other'.
# 'Other' already exists as a category, so the rows can be reassigned
# directly and the emptied 'Not Available' category dropped afterwards.
# (Removed: leftover commented-out lines copy-pasted from the
# EmploymentStatus cleanup — they referred to the wrong column.)
na_ = df[df['ListingCategory_alpha'] == 'Not Available'].index
df.loc[na_, 'ListingCategory_alpha'] = 'Other'
# drop the now-unused category from the categorical dtype
df['ListingCategory_alpha'] = df['ListingCategory_alpha'].cat.remove_categories(['Not Available'])
df['ListingCategory_alpha'].cat.categories
Index(['Auto', 'Baby&Adoption', 'Boat', 'Business', 'Cosmetic Procedure',
'Debt Consolidation', 'Engagement Ring', 'Green Loans',
'Home Improvement', 'Household Expenses', 'Large Purchases',
'Medical/Dental', 'Motorcycle', 'Other', 'RV', 'Student Use', 'Taxes',
'Vacation', 'Wedding Loans'],
dtype='object')
The Borrower's Annual Percentage Rate (APR) for the loan.
df.BorrowerAPR.isna().sum()
0
# let's plot the distribution, I know Q-Q plot is already bivraiate but it is already so useful
metric_distri_qq(df.BorrowerAPR, [12,5], 0.01, 0.02)
Comment: Difficult ... there are a lot of values at the end in the bins of 0,35 - 0,36 (btw. which is more than 30%). However looking to the distribution I would say it can be considered as normal distributed.
The Borrower's interest rate for this loan.
df.BorrowerRate.describe()
count 84853.000000 mean 0.196022 std 0.074631 min 0.040000 25% 0.135900 50% 0.187500 75% 0.257400 max 0.360000 Name: BorrowerRate, dtype: float64
# let's plot the distribution, I know Q-Q plot is already bivraiate but it is still so useful
metric_distri_qq(df.BorrowerRate, [12,5], 0.01, 0.02)
Comment: Very similar to the APR, which is no surprise as the APR includes the fees on top of the rate — everything the borrower needs to pay. Actually it looks a bit better than the APR, as there is only one spike far right, in the bin of 0.33. This might be related to the spike of the BorrowerAPR.
df.Term.describe()
count 84853.000000 mean 42.486135 std 11.640346 min 12.000000 25% 36.000000 50% 36.000000 75% 60.000000 max 60.000000 Name: Term, dtype: float64
df.Term.isna().sum()
0
# Distribution of loan terms (in months).
plt.figure(figsize=[12, 5])
term_ax = sb.countplot(data=df, x='Term', color=base_color)
plt.xticks(rotation=90);
Comment: 36 month term is clearly dominant, followed by 60 month. 12 month seems to be not that important.
df.LoanStatus.describe()
count 84853 unique 11 top Current freq 56576 Name: LoanStatus, dtype: object
df.LoanStatus.isna().sum()
0
# Count of loans per LoanStatus.
plt.figure(figsize=[12, 5])
status_ax = sb.countplot(data=df, x='LoanStatus', color=base_color)
plt.xticks(rotation=90);
df.LoanStatus.value_counts(normalize=True)
Current 0.666753 Completed 0.231742 Chargedoff 0.062885 Defaulted 0.011844 Past Due (1-15 days) 0.009499 Past Due (31-60 days) 0.004278 Past Due (61-90 days) 0.003689 Past Due (91-120 days) 0.003583 Past Due (16-30 days) 0.003123 FinalPaymentInProgress 0.002416 Past Due (>120 days) 0.000189 Name: LoanStatus, dtype: float64
Comment: Most of the loans are in good shape either "Completed" or "Current", some of the loans are in in a Past Due category indicating that the lender is behind the payment schedule.
Finally there is a considerable amount of loans defaulted (comes after the Past Due) and even more are finally Charged Off (appr. 6%). Which indicates the potential risk.
The debt to income ratio of the borrower at the time the credit profile was pulled. This value is Null if the debt to income ratio is not available. This value is capped at 10.01 (any debt to income ratio larger than 1000% will be returned as 1001%). The lower the better!
df.DebtToIncomeRatio.describe()
count 77557.000000 mean 0.258752 std 0.318326 min 0.000000 25% 0.150000 50% 0.220000 75% 0.320000 max 10.010000 Name: DebtToIncomeRatio, dtype: float64
print(df.DebtToIncomeRatio.isna().sum() , df[df['DebtToIncomeRatio'] == 0.0]['ListingNumber'].count())
7296 6
Comment: As I don't know how the ratio is exactly calculated, I need to exclude the NAs. The 6 records with 0.0 I treat as properly calculated.
# let's plot the distribution, I know Q-Q plot is already bivraiate but it is still so useful
metric_distri_qq(df[df.DebtToIncomeRatio.notnull()]['DebtToIncomeRatio'], [12,5], 0.2, 1.0)
Comment: This isn't looking normal distributed at all. As it's a financial KPI and the original scale is pretty much right skewed, a log scale might better explain the distribution.
# Log-scale view of the debt-to-income ratio.
# Fixes:
# 1) `&` binds tighter than `>` in Python, so the original mask evaluated
#    `(notnull & ratio) > 0` — the comparison must be parenthesized.
# 2) tick_locs contained `0,5` (two ticks, 0 and 5) where a single tick
#    at 0.5 was clearly intended for this 0.01-1.0 ratio axis.
metric_distri_log(data = df[df.DebtToIncomeRatio.notnull() & (df.DebtToIncomeRatio > 0)]['DebtToIncomeRatio'],
                  figsize = [12, 5],
                  bin_size_n = 0.1,
                  tick_size_n = 1.0,
                  bin_size_l = 0.1,
                  tick_locs = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0],
                  initial = False)
Comment: Better the log transformed scale shows a normal looking distribution.
# log10-transform the strictly positive, non-null ratios for the Q-Q plot.
# Fix: parenthesize the comparison — `&` binds tighter than `>`, so the
# original expression evaluated `(notnull & ratio) > 0` instead of the
# intended `notnull & (ratio > 0)`.
log_d2ir = np.log10(df[df.DebtToIncomeRatio.notnull() & (df.DebtToIncomeRatio > 0)]['DebtToIncomeRatio'])
metric_distri_qq2(log_d2ir, [12, 5], 0.1, 0.0)
Comment: The Q-Q Plot and the standard test isn't perfect but the log10 scale plot looks definitely more normal than the original scale.
#How many records have a zero ratio?
df.query('DebtToIncomeRatio == 0')['ListingNumber'].count()
6
# ok we eliminate them
# NOTE(review): this query drops the NaN ratios as well as the zeros
# (NaN fails the `> 0` comparison) — consistent with the earlier decision
# to exclude the NAs, but worth being aware of.
df = df.query('DebtToIncomeRatio > 0')
# Let's store the log-transformed ratio for later analysis.
# NOTE(review): the "_ln" suffix is a misnomer — the transform is log10,
# not the natural log; the column name is kept for downstream consistency.
df['DebtToIncomeRatio_ln'] = np.log10(df['DebtToIncomeRatio'])
df['DebtToIncomeRatio_ln'].describe()
count 77551.000000 mean -0.670481 std 0.262030 min -2.000000 25% -0.823909 50% -0.657577 75% -0.494850 max 1.000434 Name: DebtToIncomeRatio_ln, dtype: float64
The income range of the borrower at the time the listing was created.
df.IncomeRange.value_counts()
$50,000-74,999 24030 $25,000-49,999 22023 $100,000+ 14013 $75,000-99,999 13644 $1-24,999 3840 Not employed 1 Name: IncomeRange, dtype: int64
Comment: I think it could be better to look after the organic distribution of the income the borrowers have at hand at the time of the
listing creation. let's have a look on StatedMonthlyIncome.
The monthly income the borrower stated at the time the listing was created.
df.StatedMonthlyIncome.describe().apply(lambda x: format(x, '.2f'))
count 77551.00 mean 5940.30 std 4000.82 min 0.25 25% 3533.33 50% 5000.00 75% 7166.67 max 158333.33 Name: StatedMonthlyIncome, dtype: object
df[df.StatedMonthlyIncome == 0]['ListingNumber'].count()
0
# let's plot the distribution, I know Q-Q plot is already bivraiate but it is still so useful
metric_distri_qq(df.query('StatedMonthlyIncome > 0 & StatedMonthlyIncome <= 40000' )['StatedMonthlyIncome'], [12,5], 500, 5000)
Comment: I could only produce a meaningful plot by filtering below 40k. The data is extremely right-skewed and, on the original scale, definitely not normally distributed.
# ok let's do a scale log transformation
metric_distri_log(data = df.query('StatedMonthlyIncome > 0 & StatedMonthlyIncome <= 100000' )['StatedMonthlyIncome'],
figsize = [12, 5],
bin_size_n = 2000,
tick_size_n = 10000,
bin_size_l = 0.05,
tick_locs = [500, 1000, 2500, 5000, 10000, 25000, 50000, 100000 ],
initial = False)
1750002.916667
df.query('StatedMonthlyIncome > 20000' )['StatedMonthlyIncome'].count()
793
Comment: A few outliers are pretty much moving the shape to the right. 793 "Monthly Incomes" are higher than 20,000$.
log_d2ir = np.log10(df.query('StatedMonthlyIncome > 0 & StatedMonthlyIncome <= 100000' )['StatedMonthlyIncome'])
metric_distri_qq2(log_d2ir, [12,5], 0.05, 1)
Comment: The Q-Q Plot is not really underlining the normality. However the log transformed plot is really much more following a "bell-shape" then the original scale. Let's check as well closer in the up-coming analysis.
#How many are 0
df.query('StatedMonthlyIncome == 0')['ListingNumber'].count()
0
# Let's store the log
df['StatedMonthlyIncome_ln'] = np.log10(df['StatedMonthlyIncome'])
df['StatedMonthlyIncome_ln'].describe()
count 77551.000000 mean 3.702245 std 0.257954 min -0.602060 25% 3.548185 50% 3.698970 75% 3.855317 max 5.199572 Name: StatedMonthlyIncome_ln, dtype: float64
df.MonthlyLoanPayment.describe()
count 77551.000000 mean 295.766419 std 189.328517 min 0.000000 25% 158.490000 50% 256.390000 75% 392.280000 max 2251.510000 Name: MonthlyLoanPayment, dtype: float64
df.MonthlyLoanPayment.isna().sum()
0
df.MonthlyLoanPayment[df.MonthlyLoanPayment <= 10].count()
454
df[df.MonthlyLoanPayment == 0]['ListingNumber'].count()
443
df[df.MonthlyLoanPayment < 10]['LoanStatus'].value_counts()
Completed 432 Defaulted 18 FinalPaymentInProgress 4 Name: LoanStatus, dtype: int64
df[df.LoanStatus.isin(['Completed', 'Defaulted','FinalPaymentInProgress'])]['ListingNumber'].count()
18774
Comment: There are some values here which are extremely low, below 10$, and they spread across different categories like Completed etc. However, within those categories the proportion of 0 is very low.
# let's plot the distribution, I know Q-Q plot is already bivraiate but it is still so useful
metric_distri_qq(df.MonthlyLoanPayment, [12,5], 35, 100)
Comment: Not looking normal on the original scale. Outliers are in the bins of 100 and 200
# ok let's do a scale log transformation
metric_distri_log(data = df.MonthlyLoanPayment[df.MonthlyLoanPayment > 0],
figsize = [12, 5],
bin_size_n = 30,
tick_size_n = 200,
bin_size_l = 0.1,
tick_locs = [10, 20, 50, 100, 200, 500, 1000, 2000],
initial = False)
log_data = np.log10(df.MonthlyLoanPayment[df.MonthlyLoanPayment > 0])
metric_distri_qq2(log_data, [12,5], 0.1, 1)
Comment: The log transformation seems to decribe the values pretty well.
df.query('MonthlyLoanPayment == 0')['PercentFunded'].mean()
0.9993555304740406
Normally we could look at how to fill the loan payments, but as it is a small number we can get rid of it.
# Drop the zero-payment rows, then keep the log10-transformed payment.
df = df[df['MonthlyLoanPayment'] > 0]
df['MonthlyLoanPayment_ln'] = np.log10(df['MonthlyLoanPayment'])
df['MonthlyLoanPayment_ln'].describe()
count 77108.000000 mean 2.387007 std 0.284090 min -0.619789 25% 2.203495 50% 2.411670 75% 2.594183 max 3.352474 Name: MonthlyLoanPayment_ln, dtype: float64
The origination amount of the loan.
df.LoanOriginalAmount.describe()
count 77108.000000 mean 9300.966683 std 6398.995358 min 1000.000000 25% 4000.000000 50% 8000.000000 75% 14584.500000 max 35000.000000 Name: LoanOriginalAmount, dtype: float64
df.LoanOriginalAmount.isna().sum()
0
# let's plot the distribution, I know Q-Q plot is already bivraiate but it is still so useful
metric_distri_qq(df.LoanOriginalAmount, [12,5], 500, 5000)
Seems to be not normal....
# ok let's do a scale log transformation
metric_distri_log(data = df.LoanOriginalAmount,
figsize = [12, 5],
bin_size_n = 500,
tick_size_n = 5000,
bin_size_l = 0.05,
tick_locs = [500, 1000, 2500, 5000, 10000, 25000, 50000],
initial = False)
Looks a bit better however still not the ideal bell curve or close to it.
log_data = np.log10(df.LoanOriginalAmount)
metric_distri_qq2(log_data, [12,5], 0.05, 0.1)
Comment: Ok, looking better but still not normal. So I decide to work with the original scale.
Percent the listing was funded.
df.PercentFunded.describe()
count 77108.000000 mean 0.998156 std 0.020428 min 0.700000 25% 1.000000 50% 1.000000 75% 1.000000 max 1.012500 Name: PercentFunded, dtype: float64
Nice — sometimes more was funded than requested...
df.PercentFunded.isna().sum()
0
# let's plot the distribution, I know Q-Q plot is already bivraiate but it is still so useful
metric_distri_qq(df.PercentFunded, [12,5], 0.005, 0.01)
Comment: Most of the times the full amount was paid out.
The estimated return assigned to the listing at the time it was created. Estimated return is the difference between the Estimated Effective Yield and the Estimated Loss Rate. Applicable for loans originated after July 2009.
# let's plot the distribution, I know Q-Q plot is already bivraiate but it is still so useful
metric_distri_qq(df.EstimatedReturn, [12,5], 0.01, 0.05)
df.query('EstimatedReturn <= 0')['ListingNumber'].count()
140
Comment: ok the Estimated return is sometimes 0 or negative. Apart from that it looks pretty normal. What about the Loss?
LossEstimated loss is the estimated principal loss on charge-offs. Applicable for loans originated after July 2009.
# let's plot the distribution, I know Q-Q plot is already bivraiate but it is still so useful
metric_distri_qq(df.EstimatedLoss, [12,5], 0.005, 0.05)
Comment: The EstimatedLoss distribution is clearly right-skewed on the original scale. Let's try a log transformation.
# ok let's do a scale log transformation
metric_distri_log(data = df.EstimatedLoss,
figsize = [12, 5],
bin_size_n = 0.005,
tick_size_n = 0.05,
bin_size_l = 0.025,
tick_locs = [0.005, 0.01 , 0.025, 0.05, 0.1,0.25, 0.5 ],
initial = False)
Comment: From right to left skewed. We will look in the further analysis and stay with the original scale.
Cleaning General:
The following attributes have being excluded: ListingKey, CreditGrade, CurrentlyInGroup, GroupKey, DateCreditPulled, FirstRecordedCreditLine, OpenRevolvingAccounts, OpenRevolvingMonthlyPayment, TotalInquiries, PublicRecordsLast10Years, PublicRecordsLast12Months, TotalTrades, ,TradesNeverDelinquent (percentage), TradesOpenedLast6Months, LoanKey, LoanCurrentDaysDelinquent, LoanFirstDefaultedCycleNumber, LoanMonthsSinceOrigination ,LoanNumber, LoanOriginationQuarter, LP_CustomerPayments, LP_CustomerPrincipalPayments, LP_InterestandFees, LP_ServiceFees, LP_CollectionFees, LP_GrossPrincipalLoss, LP_NetPrincipalLoss, LP_NonPrincipalRecoverypayments.
Prosper Rating:
As we have 25% not populated because the Prosper Rating started after July 2009 I thought maybe a simple rule (or even regression) for the derivation of the Prosper
rating based on the external one would be easy. But it is not as e.g. D is between 680 - 699 and E later down as well. So let's flag them as before_July09
and analyze keeping decide at th end to keep or to get rid of them.
BorrowerAPR: There are a lot of values at the end in the bins of 0,35 - 0,36 (btw. which is more than 30%). However looking to the distribution I would say it can be considered as normal distributed.
DebtToIncomeRatio: This isn't looking normal distributed at all. As it's a financial KPI and the original scale is pretty much right skewed, a log scale might better explain the distribution.
StatedMonthlyIncome: The Q-Q Plot is not really underlining the normality. However the log transformed plot is really much more following a "bell-shape" then the original scale.
MonthlyLoan Payment: Here are some values here which are extremely low below 10$ and they spread to different categories like completed etc. However out of those categories the proportion of 0 is very low. The log scale describes the data pretty normal.
EstimatedReturn:The Estimated return is sometimes 0 or negative. Apart from that it looks pretty normal.
EstimatedLoss: The data looks right-skewed, after the log transform it look left-skewed, so I decide to stay with the original scale.
Now move o to the bivariate exploration, first see if there is a any insight in the following attribute comparisons.
# Numeric variables for the correlation and pairwise analysis
# (log-transformed versions where the univariate exploration favored them).
numeric_vars = ['BorrowerAPR', 'BorrowerRate', 'DebtToIncomeRatio_ln', 'StatedMonthlyIncome_ln', 'MonthlyLoanPayment_ln',
'LoanOriginalAmount', 'EstimatedLoss', 'EstimatedReturn']
plt.figure(figsize = [15, 5])
# correlation plot: diverging colormap centered at zero correlation
sb.heatmap(df[numeric_vars].corr(), annot = True, fmt = '.3f', cmap = 'vlag_r', center = 0);
# plot matrix: sample 7000 (appr. 10%) so that plots are clearer and they render faster
# NOTE(review): the sample is not seeded, so the pair plot differs between
# runs — consider seeding for reproducibility.
samples = np.random.choice(df.shape[0], 7000, replace = False)
pairplot_samp = df.iloc[samples,:]
# Pairplot of the variables of interest: histograms on the diagonal,
# scatter plots off-diagonal.
g = sb.PairGrid(data = pairplot_samp, vars = numeric_vars)
g = g.map_diag(plt.hist, bins = 30);
g.map_offdiag(plt.scatter);
Ok, the correlation matrix and the pairplot show some good candidates for continous comparison. I continue based on th equestion I formulated.
categorial_2d(data=df, x='Occupation',y='BorrowerState',zoom_x=15,zoom_y=20,fig_size=[9,5], percent=True, annotate=False )
Occupation BorrowerState Denominator: 77108
Comment: As expected, the pivot of state and occupation shows that e.g. that the group "Others" and "Professionals" comimng from California (CA) are the most frequent. We have more computer programmers in California then other states.
Let's look if there is an intersting insight for lenders owing a house. Do they own houses?
Are there states with more homeowners in the Prosper Community?
# Homeownership split per state, states ordered by overall listing count.
plt.figure(figsize=[12, 5])
state_order = df['BorrowerState'].value_counts().index
ax = sb.countplot(data=df, x='BorrowerState', hue='IsBorrowerHomeowner', order=state_order);
plt.xticks(rotation=90);
Comment: The homeowner counts in California (CA) are much lower than e.g. Texas. In Florida it is nearly even. The smaller the overall Prosper utilization gets, the bigger the share of homeowners gets (with a few exceptions).
I'm not a US expert, but comparing e.g. California and NY with WA is also comparing metropolitan areas vs. countryside. Which occupation do homeowners have?
Are there occupation groups having more houses?
plt.figure(figsize = [5, 12])
# plot and assign a axis
ax = sb.countplot(data = df, y = 'Occupation', hue = 'IsBorrowerHomeowner',
order=df['Occupation'].value_counts().index);
plt.xticks(rotation=60);
# Zoom to the top 20
plt.figure(figsize = [12, 5])
# plot and assign a axis
ax = sb.countplot(data = df, x = 'Occupation', hue = 'IsBorrowerHomeowner',
order=df['Occupation'].value_counts().index[0:20]);
plt.xticks(rotation=90);
Comment: Group "Others" have less "Homeowners" than e.g. than "Professionals" the 2nd largest group. Executives tend to have more real estate property as well. Overall we know that the split of Homeowners and Non-Homeowners is nearly even. Which Employement Status do the different occupations have?
Which Occupations have which Employment Status?
categorial_2d(data=df, x='Occupation',y='EmploymentStatus',zoom_x=0,zoom_y=0,fig_size=[20,5], percent=True, annotate=False )
Occupation EmploymentStatus Denominator: 77108
Comment: Biggest group "Others" has most of them as "Employed". The following groups "Professional", "Comp. Progr.", "Executive" and have no (or neglectable) count of not employed or retired. Even part-time proportion is very low. So bottomline the Prosper Community sees to be Full-time employed.
There might be a relationship between state and listing type?
categorial_2d(data=df, x='ListingCategory_alpha',y='BorrowerState',zoom_x=0,zoom_y=0,fig_size=[10,10], percent=True, annotate=False )
ListingCategory_alpha BorrowerState Denominator: 77108
Comment: Nothing special here, California and Debt Consolidation collects most of the counts. Debt consolidation seems to be leading:
Do we have occupation categories doing a particular type of loan?
Are there proffessions doing a particular ListingCategory more often?
categorial_2d(data=df, x='ListingCategory_alpha',y='Occupation',zoom_x=0,zoom_y=0,fig_size=[10,10], percent=True, annotate=False)
ListingCategory_alpha Occupation Denominator: 77108
Comment: The "Other" categories are kind of dominating. Professionals do the most Debt Consolidation, followed by Home Improvement and Business. Not a big insight, as we know already that the consolidation of loans is the strongest use case. Do homeowners do different things with the money?
Are Homeowners more in the Home Improvement Busienss?
plt.figure(figsize = [12, 5])
# plot and assign a axis
ax = sb.countplot(data = df, x = 'ListingCategory_alpha', hue = 'IsBorrowerHomeowner',
order=df['ListingCategory_alpha'].value_counts().index);
plt.xticks(rotation=60, ha='right');
df.groupby(['ListingCategory_alpha', 'IsBorrowerHomeowner']).size()/len(df)*100
ListingCategory_alpha IsBorrowerHomeowner
Auto Yes 1.084194
No 1.505680
Baby&Adoption Yes 0.130985
No 0.116719
Boat Yes 0.047985
No 0.058360
Business Yes 2.582093
No 2.120403
Cosmetic Procedure Yes 0.042797
No 0.060953
Debt Consolidation Yes 35.454168
No 28.783006
Engagement Ring Yes 0.071329
No 0.188048
Green Loans Yes 0.029828
No 0.028531
Home Improvement Yes 5.747782
No 2.451108
Household Expenses Yes 1.086787
No 1.215179
Large Purchases Yes 0.439643
No 0.603050
Medical/Dental Yes 0.897443
No 0.914302
Motorcycle Yes 0.158220
No 0.216579
Other Yes 5.003372
No 5.749079
RV Yes 0.027235
No 0.037610
Student Use Yes 0.079110
No 0.176376
Taxes Yes 0.606941
No 0.408518
Vacation Yes 0.413705
No 0.514862
Wedding Loans Yes 0.359236
No 0.588785
dtype: float64
Comment: Debt Consolidation is highest for Homeowner as well. Surprisingly Home Improvement is done by non-Homeowner as well (appr. 2,5%).
Ok, by now not too much intersting insights let's explore further some quantitative attributes.
How much money do the different occupations have at hand?
df.StatedMonthlyIncome_ln.describe()
count 77108.000000 mean 3.702410 std 0.257981 min -0.602060 25% 3.549192 50% 3.698970 75% 3.855317 max 5.199572 Name: StatedMonthlyIncome_ln, dtype: float64
# Per-occupation means; facet columns are ordered by mean StatedMonthlyIncome (descending).
occ_montly_income = df.groupby(['Occupation']).mean()
occ_montly_income_order = occ_montly_income.sort_values(['StatedMonthlyIncome'], ascending = False).index
# plot the vertical mean
def vertical_mean_line(x, **kwargs):
    """Draw a dashed red vertical line at the mean of series ``x`` (FacetGrid helper)."""
    plt.axvline(x.mean(), color='red', linestyle='dashed', linewidth=2)
def mean_as_text(col):
    # NOTE(review): debugging helper, never called anywhere in this analysis —
    # candidate for removal.
    print(col)
    a = str(occ_montly_income.query('Occupation == @col')['StatedMonthlyIncome'].mean())
    print(col + a)
    return a
# create the facet plot: one income histogram per occupation, mean marked in red
g = sb.FacetGrid(data = df, col = 'Occupation', col_wrap = 8, size = 4, sharex=False,col_order = occ_montly_income_order);
g.map(plt.hist, 'StatedMonthlyIncome', bins = np.arange(100, 20000, 500));
g.map(vertical_mean_line, 'StatedMonthlyIncome');
g.set_titles('{col_name}');
Comment: The histograms above are ordered descending by the available income in each group. Interestingly, the mean income (red line) has a different order. Let's look at that.
# Horizontal bar chart of mean stated monthly income per occupation,
# with the overall mean income as a dashed red reference line.
occ_mi_mean = df.groupby('Occupation')['StatedMonthlyIncome'].mean().reset_index(name='MeanIncome')
occ_mi_mean.MeanIncome = occ_mi_mean.MeanIncome.astype(float)
income_order = occ_mi_mean.sort_values(['MeanIncome'], ascending=False)['Occupation']
plt.figure(figsize=[5, 14])
sb.barplot(y="Occupation", x="MeanIncome", data=occ_mi_mean, color=base_color, order=income_order);
plt.xticks(rotation=90);
plt.xticks(np.arange(0, occ_mi_mean.MeanIncome.max() + 1000, 1000))
plt.axvline(df.StatedMonthlyIncome.mean(), color='red', linestyle='dashed', linewidth=2);
Comment: Ok, doctors have the most available income ...
plt.figure(figsize = [3, 5])
sb.violinplot(data = occ_mi_mean, x = 'MeanIncome', color = base_color);
Comment: However Professionals, Computer Programmers and the very large group Others are still above the average income.
Let's see how the overall debt situation is looking like..
Let's neglect the outlier for the momnet and have a look to the values up to the 3rd quantile.
df.DebtToIncomeRatio.describe()
count 77108.000000 mean 0.259043 std 0.318858 min 0.010000 25% 0.150000 50% 0.220000 75% 0.320000 max 10.010000 Name: DebtToIncomeRatio, dtype: float64
# Mean debt-to-income ratio per occupation.
occ_d2ir_mean = df.groupby('Occupation')['DebtToIncomeRatio'].mean().reset_index(name = 'MeanDebt2IncomeRatio')
occ_d2ir_mean.MeanDebt2IncomeRatio = occ_d2ir_mean.MeanDebt2IncomeRatio.astype(float)
plt.figure(figsize = [3, 5])
sb.violinplot(data = occ_d2ir_mean, x = 'MeanDebt2IncomeRatio', color = base_color);
# order the facet columns by mean DebtToIncomeRatio (descending)
occ_d2ir = df.groupby(['Occupation']).mean()
occ_d2ir = occ_d2ir.sort_values(['DebtToIncomeRatio'], ascending = False).index
# plot the vertical mean
def vertical_mean_line(x, **kwargs):
    """Draw a dashed red vertical line at the mean of series ``x`` (FacetGrid helper)."""
    plt.axvline(x.mean(), color='red', linestyle='dashed', linewidth=2)
# create the plot; describe()[6] is the 75th percentile, so bins reach a bit past Q3
g = sb.FacetGrid(data = df, col = 'Occupation', col_wrap = 8, size = 4, sharex=False,col_order = occ_d2ir);
g.map(plt.hist, 'DebtToIncomeRatio', bins = np.arange(0, df.DebtToIncomeRatio.describe()[6]+0.3, 0.01));
g.map(vertical_mean_line, 'DebtToIncomeRatio');
g.set_titles('{col_name}');
# Bar chart of mean debt-to-income ratio per occupation, with the overall
# mean ratio marked as a dashed red reference line.
plt.figure(figsize = [5, 14])
sb.barplot(y="Occupation", x="MeanDebt2IncomeRatio", data=occ_d2ir_mean, color=base_color,
           order=occ_d2ir_mean.sort_values(['MeanDebt2IncomeRatio'], ascending = False)['Occupation']);
plt.xticks(rotation=90);
plt.xticks(np.arange(0, occ_d2ir_mean.MeanDebt2IncomeRatio.max() + 0.1, 0.1))
# use .mean() directly instead of the fragile positional describe()[1]
# (same value, matches the style used for the income chart above)
plt.axvline(df.DebtToIncomeRatio.mean(), color='red', linestyle='dashed', linewidth=2);
Comment: Teachers Aide, Waiters, Students and Fodd Service have the highest ratio. Computer Programmers, Professionals are below the average Ratio, however the large group of Others is slightly above. Whart about the extemes > 0,3
occ_d2ir = df.query('DebtToIncomeRatio > @df.DebtToIncomeRatio.describe()[6]')
metric_distri_qq(occ_d2ir.DebtToIncomeRatio, [12,5], .05, 1)
df.query('DebtToIncomeRatio > 1')['DebtToIncomeRatio'].count()
320
# let's plot the distribution, I know Q-Q plot is already bivraiate but it is already so useful
metric_distri_qq(df.query('DebtToIncomeRatio > 1')['DebtToIncomeRatio'], [12,5], .1, 1)
Comment: We have 320 above 1, which means those borrowers have as much debt as they earn (and even more). There seems to be a concentration between 1 and 3. I don't consider them further as they make up only a small proportion. Let's look at the high ratios.
# Restrict to high ratios: above the 3rd quartile (describe()[6] == 75%) but <= 1.
occ_d2ir = df.query('DebtToIncomeRatio > @df.DebtToIncomeRatio.describe()[6] & DebtToIncomeRatio <= 1')
occ_d2ir_sort = occ_d2ir.groupby(['Occupation']).mean()
occ_d2ir_sort = occ_d2ir_sort.sort_values(['DebtToIncomeRatio'], ascending = False).index
# plot the vertical mean
def vertical_mean_line(x, **kwargs):
    """Draw a dashed red vertical line at the mean of series ``x`` (FacetGrid helper)."""
    plt.axvline(x.mean(), color='red', linestyle='dashed', linewidth=2)
# create the plot: per-occupation histograms of the high ratios, Q3..1 in 0.025 steps
g = sb.FacetGrid(data = occ_d2ir, col = 'Occupation', col_wrap = 8, size = 4, sharex=False, col_order = occ_d2ir_sort);
g.map(plt.hist, 'DebtToIncomeRatio', bins = np.arange(df.DebtToIncomeRatio.describe()[6], 1+0.025, 0.025))
g.map(vertical_mean_line, 'DebtToIncomeRatio');
g.set_titles('{col_name}');
Comment: Food Service, Sales/Retail, Clerical, Teachers as well as Others, Computer Programmers and Professionals have considerable amount of high ratio. How is the score looking for the occupations?
How much money do they borrow on average?
df.LoanOriginalAmount.describe()
count 77108.000000 mean 9300.966683 std 6398.995358 min 1000.000000 25% 4000.000000 50% 8000.000000 75% 14584.500000 max 35000.000000 Name: LoanOriginalAmount, dtype: float64
# Facet columns ordered by mean LoanOriginalAmount (descending).
occ_rate = df.groupby(['Occupation']).mean()
occ_rate = occ_rate.sort_values(['LoanOriginalAmount'], ascending = False).index
# plot the vertical mean
def vertical_mean_line(x, **kwargs):
    """Draw a dashed red vertical line at the mean of series ``x`` (FacetGrid helper)."""
    plt.axvline(x.mean(), color='red', linestyle='dashed', linewidth=2)
# create the plot: one loan-amount histogram per occupation
g = sb.FacetGrid(data = df, col = 'Occupation', col_wrap = 8, size = 4, sharex=False,col_order = occ_rate);
g.map(plt.hist, 'LoanOriginalAmount', bins = np.arange(1000, 35000, 1000));
g.map(vertical_mean_line, 'LoanOriginalAmount');
g.set_titles('{col_name}');
# mean loan amount per occupation for the violin and bar charts
occ_loan_mean = df.groupby('Occupation')['LoanOriginalAmount'].mean().reset_index(name = 'MeanLoanOriginalAmount')
occ_loan_mean.MeanLoanOriginalAmount = occ_loan_mean.MeanLoanOriginalAmount.astype(float)
plt.figure(figsize = [3, 5])
sb.violinplot(data = occ_loan_mean, y = 'MeanLoanOriginalAmount', color = base_color);
#Plot along the occupations, largest mean first
plt.figure(figsize = [5, 12])
sb.barplot(y="Occupation", x="MeanLoanOriginalAmount", data=occ_loan_mean, color=base_color, order=occ_loan_mean.sort_values(['MeanLoanOriginalAmount'], ascending = False)['Occupation']) ;
plt.xticks(rotation=90);
plt.xticks(np.arange(1000,occ_loan_mean.MeanLoanOriginalAmount.max()+1000,1000))
# NOTE(review): describe()[1] here is the mean of the *per-occupation means*,
# not the overall df.LoanOriginalAmount mean used in the analogous income
# chart — confirm which reference line is intended.
plt.axvline(occ_loan_mean.MeanLoanOriginalAmount.describe()[1], color='red', linestyle='dashed', linewidth=2);
First let's compare the 2 main scoring attributes ProsperScore and ProsperRating,
# 2D histogram of ProsperRating (x) vs. ProsperScore (y).
df['ProsperScore_Num'] = df['ProsperScore'].astype(float)
fig, ax = plt.subplots(1,1)
# adjust the bins: rating takes values 1-7, score 1-11
bins_x = np.arange(1, 8, 1)
bins_y = np.arange(1, 12, 1)
# cmin = 0.5 leaves empty cells blank instead of coloring them as zero
img = ax.hist2d(data = df, x = 'ProsperRating_numeric', y = 'ProsperScore_Num',
bins = [bins_x, bins_y], cmap = 'viridis_r', cmin = 0.5 )
# adjust the labels
plt.xlabel('ProsperRating')
plt.ylabel('ProsperScore');
# assumes ProsperRating_numeric 1..7 maps to HR..AA in ascending order — TODO confirm
x_ticks_labels = ["HR", "E", "D", "C", "B", "A", "AA"]
# Set number of ticks for x-axis
ax.set_xticks(bins_x)
ax.set_yticks(bins_y)
# Set ticks labels for x-axis
ax.set_xticklabels(x_ticks_labels, fontsize=12, ha='left');
ax.set_yticklabels(labels=bins_y, va='baseline');
# img[3] is the QuadMesh returned by hist2d, used as the colorbar mappable
plt.colorbar(img[3], ax=ax);
Comment: The darker area somehow shows the relationsip between the 2. However Prosper Rating e.g. of "C" gives a range of from 2-11.
categorial_2d(data=df, x='Occupation',y='ProsperRating_Alpha',zoom_x=10,zoom_y=10,fig_size=[12,5], percent=True, annotate=True)
Occupation ProsperRating_Alpha Denominator: 77108
Comment: The majority of the different professions are concentrated in the area from C to A. What is the Score telling us?
# zoom to the first most count occupations
categorial_2d(data=df, x='Occupation',y='ProsperScore',zoom_x=10,zoom_y=10,fig_size=[12,5], percent=True, annotate=True)
Occupation ProsperScore Denominator: 77108
Comment: It ranges 2-10, so the extreme 1 and 11 are much lesser seen. Espeically the usual suspects Others, Professionals, Executives , Computer Programmer, Teacher etc. are underlining that.
Summary Who is using Prosper:
In general the occupation group "Other" is dominating the listings, whereas the other occupations each have a small proportion. This is a pity, as it might be due to the fact that users can simply pick the "Other" category from a drop-down list.
Let's move over to my 2nd question I had and pull some atributes for bi-variate analysis.
I had immidiatly 2 things in mind why the borrowers use Prosper instead using a normal bank. As we know now already that Debt Consolidation is the most frequent ListingType so loan type, what makes it so attractive? Is it the time from application to the point of time receiving the money. I call that time to money.
# Build weekly time series (weeks anchored on Mondays) of loan originations
# and listing creations, then merge them into one frame indexed by week.
week_loan = df.groupby([pd.Grouper(key='LoanOriginationDate', freq='W-MON')])['ListingNumber'].count().reset_index().sort_values('LoanOriginationDate')
week_list = df.groupby([pd.Grouper(key='ListingCreationDate', freq='W-MON')])['ListingNumber'].count().reset_index().sort_values('ListingCreationDate')
# unify column names so both frames can be merged on 'week'
week_list.rename(index=str, columns={"ListingCreationDate": "week", "ListingNumber": "ListingCount"}, inplace=True)
week_loan.rename(index=str, columns={"LoanOriginationDate": "week", "ListingNumber": "LoanCount"}, inplace=True)
# outer merge keeps weeks that appear in only one of the two series;
# those gaps become NaN and are filled with 0 counts
week = pd.merge(week_list, week_loan, how='outer', on='week')
week.fillna(0,inplace=True)
week.set_index('week', inplace=True)
week.head()
| ListingCount | LoanCount | |
|---|---|---|
| week | ||
| 2009-07-13 | 12.0 | 0.0 |
| 2009-07-20 | 66.0 | 1.0 |
| 2009-07-27 | 51.0 | 3.0 |
| 2009-08-03 | 55.0 | 16.0 |
| 2009-08-10 | 61.0 | 54.0 |
# Time2Money: number of days between listing creation and loan origination,
# rounded to whole days.
time_to_money = df['LoanOriginationDate'] - df['ListingCreationDate']
df['Time2Money'] = time_to_money.dt.round('1d').dt.days
df.Time2Money.describe()
count 77108.000000 mean 11.102674 std 17.459497 min 0.000000 25% 4.000000 50% 7.000000 75% 12.000000 max 530.000000 Name: Time2Money, dtype: float64
# Weekly mean Time2Money, indexed by listing-creation week.
# Fix: sort the time series by its date column — the original sorted by
# 'Time2Money' (likely a copy-paste slip); the plot was unaffected only
# because sb.lineplot re-sorts by x internally.
week_time2money_avg = df.groupby([pd.Grouper(key='ListingCreationDate', freq='W-MON')])['Time2Money'].mean().reset_index().sort_values('ListingCreationDate')
# plot the weekly time series
plt.figure(figsize = [10, 5]) # larger figure size for subplots
# top: weekly counts of listings created vs. loans originated
plt.subplot(2, 1, 1) # 2 rows, 1 col, subplot 1
ax = sb.lineplot(week.index, week.ListingCount, label='Listings created')
ax = sb.lineplot(week.index, week.LoanCount, label='Loans originated')
# bottom: weekly mean number of days from listing to payout
plt.subplot(2, 1, 2) # 2 rows, 1 col, subplot 2
ax2 = sb.lineplot(week_time2money_avg.ListingCreationDate, week_time2money_avg.Time2Money, label='Time2Money(days)')
Comment: Fast is different, however the platform Prosper seems to accelarate with increasing Listings. There is a clear downward trend in Time2Money attribute, which might attract the borrowers. What else could attract borrowers to use Prosper? What else could be reason to use Prosper? For sure the money you need to pay which can be summarized by the Borrower Annual % Rate.
# create weekly time series of the mean APR and the mean borrower rate
week_apr = df.groupby([pd.Grouper(key='ListingCreationDate', freq='W-MON')])['BorrowerAPR'].mean().reset_index().sort_values('ListingCreationDate')
week_rate = df.groupby([pd.Grouper(key='ListingCreationDate', freq='W-MON')])['BorrowerRate'].mean().reset_index().sort_values('ListingCreationDate')
week_apr.set_index('ListingCreationDate', inplace=True)
week_rate.set_index('ListingCreationDate', inplace=True)
#week_apr.rename(index=str, columns={"ListingCreationDate": "week"}, inplace=True)
#week_rate.rename(index=str, columns={"ListingCreationDate": "week", "ListingNumber": "LoanCount"}, inplace=True)
# plot the time series
plt.figure(figsize = [10, 5]) # larger figure size for subplots
# NOTE(review): a 2x1 subplot grid is declared but only the first subplot
# is drawn here (week_rate is computed yet never plotted).
plt.subplot(2, 1, 1)
ax = sb.lineplot(week_apr.index, week_apr.BorrowerAPR, label='BorrowerAPR')
#ax = sb.lineplot(week.index, week.ListingCount, label='Listings created')
Comment: At the end of 2011 the rates were considerably higher than before, and this appears to have been the end of an upward trend; until mid-2014 they were constantly going down. Is there a difference by Occupation?
# per-occupation means of all numeric columns; the index sorted by mean
# BorrowerAPR orders the facets below (best mean APR first)
occ_rate = df.groupby(['Occupation']).mean()
occ_rate = occ_rate.sort_values(['BorrowerAPR'], ascending = True).index
# helper: dashed red marker at the facet's mean APR
def vertical_mean_line(x, **kwargs):
    plt.axvline(x.mean(), color='red', linestyle='dashed', linewidth=2)
# one APR histogram per occupation, ordered by mean APR
g = sb.FacetGrid(data = df, col = 'Occupation', col_wrap = 8, size = 4, sharex=False, col_order = occ_rate);
g.map(plt.hist, 'BorrowerAPR', bins = np.arange(0, .5+0.01, 0.01));
g.map(vertical_mean_line, 'BorrowerAPR');
g.set_titles('{col_name}');
Students from a technical school have the best mean APR, followed by Judges, Doctors and Investors, for example. However, one can see that the proportion of those groups isn't very high. The usual suspects in this dataset (Others, Professionals etc.) come later. Let's check the distribution of the means and bring them into an order.
# distribution and Q-Q of the per-occupation mean APR,
# reusing the univariate helper from the top of the notebook
occ_rate_mean = (df.groupby('Occupation')['BorrowerAPR']
                   .mean()
                   .reset_index(name='MeanBorrowerAPR'))
occ_rate_mean['MeanBorrowerAPR'] = occ_rate_mean['MeanBorrowerAPR'].astype(float)
metric_distri_qq(occ_rate_mean.MeanBorrowerAPR, [12, 5], 0.01, 0.01)
The means of the occupations seem to be nicely normally distributed... let's review them.
# horizontal bar chart of mean APR per occupation, sorted ascending
plt.figure(figsize = [5, 12])
sb.barplot(y="Occupation", x="MeanBorrowerAPR", data=occ_rate_mean, color=base_color, order=occ_rate_mean.sort_values(['MeanBorrowerAPR'], ascending = True)['Occupation']) ;
plt.xticks(rotation=90);
# ticks from 0 to just past the overall mean APR
plt.xticks(np.arange(0,df.BorrowerAPR.mean()+0.1,0.1))
# describe()[1] is the mean of the per-occupation means
plt.axvline(occ_rate_mean.MeanBorrowerAPR.describe()[1], color='red', linestyle='dashed', linewidth=2);
df.BorrowerAPR.mean()
0.2238441454842164
Comment: It's not a bargain — the avg. Borrower APR is approx. 22%. Let's look whether there is a relation to the score. Notably, Computer Programmers and Executives are getting far better rates than Professionals and the famous Others.
Summary: Why is Prosper used?
Finally move now to question ....
There are some "no-brainers" which we can check first.
# APR per Prosper rating: violin (left) and box (right), shared y-range
plt.figure(figsize=[12, 5])
base_color = sb.color_palette()[0]
plt.subplot(1, 2, 1)
ax1 = sb.violinplot(data=df, y='ProsperRating_Alpha', x='BorrowerAPR',
                    color=base_color, scale='width')
plt.subplot(1, 2, 2)
sb.boxplot(data=df, y='ProsperRating_Alpha', x='BorrowerAPR', color=base_color);
# align the box plot's category axis with the violin plot's
plt.ylim(ax1.get_ylim());
Comment: The distributions climb nicely as the rating decreases. Most of the distributions are bi- or multimodal, and the spread in each category is large. In category AA we find many outliers to the right, in A as well; C and D have outliers in both directions, and E has outliers to the left. See below, as an example, the density and violin plot for "E"! HR has a relatively small IQR and many outliers to the left and to the right.
So what else influences the APR?
df.DebtToIncomeRatio.isna().sum()
0
df.BorrowerAPR.isna().sum()
0
# scatter of APR vs. debt-to-income, with rug plots on both margins
g = sb.JointGrid(data=df, x='BorrowerAPR', y='DebtToIncomeRatio')
g.plot_joint(plt.scatter)
g.plot_marginals(sb.rugplot, height=0.25);
Comment: As we know from the beginning the correlation coeff = 0,128, so we have a weak linear relation. Below 2 the rates are going the full range we need to zoom in a bit. One thing which is looking strange is the nearly vertical line between 0,35 and appr. 0,37 along all DtIR.
# zoom: keep only rows with a debt-to-income ratio of at most 2
d2ir = df.query('DebtToIncomeRatio <= 2')
# same joint scatter + marginal rugs as above, on the zoomed subset
g = sb.JointGrid(data=d2ir, x='BorrowerAPR', y='DebtToIncomeRatio')
g.plot_joint(plt.scatter)
g.plot_marginals(sb.rugplot, height=0.25);
Below a DTIR of 1, and especially below 0.5, there seems to be a concentration of rates... let's check quickly in a heatmap.
# heat map of APR vs. debt-to-income on the zoomed subset
fig, ax = plt.subplots(1,1)
bins_x = np.arange(0.0, 0.5, 0.05)
bins_y = np.arange(0.0, 2.0, 0.1)
# cmin = 0.5 blanks out empty cells instead of drawing them as zero counts
img = ax.hist2d(data = d2ir, x = 'BorrowerAPR', y = 'DebtToIncomeRatio',
bins = [bins_x, bins_y], cmap = 'viridis_r', cmin = 0.5 )
plt.xlabel('BorrowerAPR')
plt.ylabel('DebtToIncomeRatio');
# Set ticks exactly on the bin edges
ax.set_xticks(bins_x)
ax.set_yticks(bins_y)
# hist2d returns (counts, xedges, yedges, image); index 3 is the mappable
plt.colorbar(img[3], ax=ax);
Comment: The majority of the values is concentrated between 0,1 and 0,4 and 0,10 to 0,25. We can see a slight upward trend in the concentrated area, but the rate also must be influenced by something else. Maybe the term?
# APR per loan term: violin (left) and box (right), shared y-range
plt.figure(figsize = [10, 4]);
base_color = sb.color_palette()[0]
# left plot: violin plot
plt.subplot(1, 2, 1);
ax1 = sb.violinplot(data = df, x = 'Term', y = 'BorrowerAPR', color = base_color, scale='width');
# right plot: box plot, y-range aligned to the violin plot
plt.subplot(1, 2, 2);
sb.boxplot(data = df, x = 'Term', y = 'BorrowerAPR', color = base_color);
plt.ylim(ax1.get_ylim());
We know from the univariate analysis that 12-month terms are very seldom, but they have lower rates as a starting point of their distributions. The 36-month term seems to be multimodal and the 60-month term right-skewed. Still, all 3 terms show a wide overall range of rates. In the 36-month term we see a 3rd modality which looks similar to the concentration we saw in the DTIR between 0.35 and 0.37 — let's look closer. The medians are close to each other; the range for 60 is smaller.
# zoom into the 0.35-0.60 band
# NOTE(review): the filter uses BorrowerRate while the plots below show
# BorrowerAPR -- confirm this mismatch is intended
df_035 = df.query('BorrowerRate >= 0.35 & BorrowerRate <= 0.6')
plt.figure(figsize = [10, 4]);
base_color = sb.color_palette()[0]
# left plot: violin plot
plt.subplot(1, 2, 1);
ax1 = sb.violinplot(data = df_035, x = 'Term', y = 'BorrowerAPR', color = base_color, scale='width');
# right plot: box plot, y-range aligned to the violin plot
plt.subplot(1, 2, 2);
sb.boxplot(data = df_035, x = 'Term', y = 'BorrowerAPR', color = base_color);
plt.ylim(ax1.get_ylim());
# how the zoomed band splits by term
df_035.Term.value_counts()
36 676 Name: Term, dtype: int64
sb.distplot(df_035['BorrowerAPR']);
Comment: So this might explain the peak line we saw in the D2IR plot mentioned before; however, either a combination of attributes or another attribute is primarily driving the rate. Let's check the Score.
# BorrowerAPR vs. ProsperScore: scatter (left) and heat map (right)
plt.figure(figsize = [12, 5])
x = np.arange(0.1, 0.5, 0.05)
# left plot: scatter of the discrete score against the APR
plt.subplot(1, 2, 1)
ax = sb.regplot(data = df, x = 'BorrowerAPR', y = 'ProsperScore', fit_reg = False,
                x_jitter = 0.0, y_jitter = 0.0, scatter_kws = {'alpha' : 1/3})
ax.set_ylim(0,12)
ax.set_xlim(0,0.5)
# right plot: heat map with bin edges between values
plt.subplot(1, 2, 2)
bins_x = np.arange(0.1, 0.5, 0.01)
bins_y = np.arange(0, 13, 1)
# BUGFIX: this heat map previously plotted StatedMonthlyIncome into the
# 0-13 score bins, clipping nearly all incomes out of range; the left panel
# and the following commentary analyse ProsperScore, so plot the score here.
plt.hist2d(data = df, x = 'BorrowerAPR', y = 'ProsperScore',
           bins = [bins_x, bins_y], cmap = 'viridis_r')
plt.colorbar();
Comment: We can see that the medians of the rates are increasing by decreasing the score. However 2 things are here interesting.
The groups 11 and 1 are relatively small. So there is still something else which controls the rates.Let's look to the income
# recode the boolean flag into readable labels for plotting
df['IncomeVerifiable'] = df['IncomeVerifiable'].replace({True: 'Yes', False: 'No'})
df.IncomeVerifiable.value_counts()
Yes 77060 No 48 Name: IncomeVerifiable, dtype: int64
# APR by income verifiability: violin (left, count-scaled) and box (right)
plt.figure(figsize=[10, 4])
base_color = sb.color_palette()[0]
plt.subplot(1, 2, 1)
ax1 = sb.violinplot(data=df, y='IncomeVerifiable', x='BorrowerAPR',
                    color=base_color, scale='count')
plt.subplot(1, 2, 2)
sb.boxplot(data=df, y='IncomeVerifiable', x='BorrowerAPR', color=base_color);
# align the box plot's category axis with the violin plot's
plt.ylim(ax1.get_ylim());
Comment: The few borrowers (48) without a verified income do get higher rates. So having all documents ready helps here as well. What else could influence the rate? Property?
# recode the boolean homeowner flag into readable labels for plotting
df['IsBorrowerHomeowner'] = df['IsBorrowerHomeowner'].replace({True: 'Yes', False: 'No'})
df.IsBorrowerHomeowner.value_counts()
Yes 41841 No 35267 Name: IsBorrowerHomeowner, dtype: int64
# APR by home ownership: violin (left) and box (right), shared y-range
plt.figure(figsize=[10, 4])
base_color = sb.color_palette()[0]
plt.subplot(1, 2, 1)
ax1 = sb.violinplot(data=df, y='IsBorrowerHomeowner', x='BorrowerAPR',
                    color=base_color, scale='width')
plt.subplot(1, 2, 2)
sb.boxplot(data=df, y='IsBorrowerHomeowner', x='BorrowerAPR', color=base_color);
# align the box plot's category axis with the violin plot's
plt.ylim(ax1.get_ylim());
Comment: Owning a house definitely helps: we can clearly see that the median rate is lower for homeowners. Ok, let's finally check the amount requested.
df.StatedMonthlyIncome_ln.describe()
count 77108.000000 mean 3.702410 std 0.257981 min -0.602060 25% 3.549192 50% 3.698970 75% 3.855317 max 5.199572 Name: StatedMonthlyIncome_ln, dtype: float64
# APR vs. log-scaled stated monthly income
plt.figure(figsize = [12, 5])
x = np.arange(0.1, 0.6, 0.01)
# left plot: scatterplot of discrete data with jitter and transparency
plt.subplot(1, 2, 1)
ax = sb.regplot(data = df, x = 'BorrowerAPR', y = 'StatedMonthlyIncome_ln', fit_reg = False,
x_jitter = 0.0, y_jitter = 0.0, scatter_kws = {'alpha' : 1/3})
# relabel the log axis with dollar amounts; 10**a implies the column holds
# base-10 logs despite the '_ln' suffix (min -0.602 = log10(0.25))
a = np.arange(-.7, 5.2, 1)
b = 10 ** a
b = np.around(b, decimals=0, out=None)
plt.yticks(a,b)
# right plot: heat map with bin edges between values
plt.subplot(1, 2, 2)
bins_x = np.arange(0.1, 0.6, 0.01)
bins_y = np.arange(-.7, 5.2, .05)
plt.hist2d(data = df, x = 'BorrowerAPR', y = 'StatedMonthlyIncome_ln',
bins = [bins_x, bins_y], cmap = 'viridis_r')
plt.yticks(a,b)
plt.colorbar();
Comment: we can see on the log scale that the most frequent area is 1900 and above 20000 (I mean it's monthly income). However the range of APR is still going from 5 - above 40%. Having a lot of money/month is not an indication to get better rates.
sb.regplot(data = pairplot_samp, x = 'BorrowerAPR', y = 'StatedMonthlyIncome', scatter_kws = {'alpha' : 1/3});
Comment: We see that the fit isn't really good but it emphasizes a bit the less income the higher the rate. In the heatmap we see a that the range goes from 0,1 to 0,4 through all income levels. Let's revisit the categorial income range.
df.IncomeRange.isna().sum()
0
df.IncomeRange.value_counts()
$50,000-74,999 23898 $25,000-49,999 21881 $100,000+ 13943 $75,000-99,999 13576 $1-24,999 3809 Not employed 1 Name: IncomeRange, dtype: int64
# impose an explicit (highest-to-lowest) order on the income brackets
income = ['$100,000+', '$75,000-99,999','$50,000-74,999', '$25,000-49,999', '$1-24,999', 'Not employed']
df['IncomeRange'] = pd.Categorical(df['IncomeRange'],
                                   categories=income, ordered=True)
df.IncomeRange.value_counts()
$50,000-74,999 23898 $25,000-49,999 21881 $100,000+ 13943 $75,000-99,999 13576 $1-24,999 3809 Not employed 1 Name: IncomeRange, dtype: int64
# APR by income range: violin (left, count-scaled) and box (right)
plt.figure(figsize = [25, 5])
base_color = sb.color_palette()[0]
# left plot: violin plot
plt.subplot(1, 2, 1)
ax1 = sb.violinplot(data = df, y = 'IncomeRange', x = 'BorrowerAPR', color = base_color, scale='count')
# right plot: box plot, y-range aligned to the violin plot
plt.subplot(1, 2, 2)
sb.boxplot(data = df, y = 'IncomeRange', x = 'BorrowerAPR', color = base_color);
plt.ylim(ax1.get_ylim());
# per-IncomeRange mean APR, used only to order the facets below
icr_rate = df.groupby(['IncomeRange']).mean()
icr_rate = icr_rate.sort_values(['BorrowerAPR'], ascending = True).index
# helper: dashed red marker at the facet's mean APR
def vertical_mean_line(x, **kwargs):
    plt.axvline(x.mean(), color='red', linestyle='dashed', linewidth=2)
# one APR histogram per income range, ordered by mean APR
g = sb.FacetGrid(data = df, col = 'IncomeRange', col_wrap = 6, size = 4, sharex=False, col_order = icr_rate)
g.map(plt.hist, 'BorrowerAPR', bins = np.arange(0, .5+0.01, 0.01));
g.map(vertical_mean_line, 'BorrowerAPR');
g.set_titles('{col_name}');
Comment: This makes it really visible that the Prosper platform is used by well-earning borrowers — note the spike between 0.3 and 0.4. Having a higher income definitely helps to get the best rates; however, there is much variance in the distributions. The proportion of borrowers below 25k, and especially the unemployed, is very small.
df.MonthlyLoanPayment.describe()
count 77108.000000 mean 297.465653 std 188.535826 min 0.240000 25% 159.770000 50% 258.030000 75% 392.810000 max 2251.510000 Name: MonthlyLoanPayment, dtype: float64
# APR vs. monthly loan payment: scatter (left) and heat map (right)
plt.figure(figsize = [12, 5])
x = np.arange(0.1, 0.6, 0.01)
# left plot: scatterplot of discrete data with jitter and transparency
plt.subplot(1, 2, 1)
ax = sb.regplot(data = df, x = 'BorrowerAPR', y = 'MonthlyLoanPayment', fit_reg = False,
x_jitter = 0.0, y_jitter = 0.0, scatter_kws = {'alpha' : 1/3})
# clamp the scatter axes to the interesting range
ax.set_ylim(0,2500)
ax.set_xlim(0,0.6)
# right plot: heat map with bin edges between values
plt.subplot(1, 2, 2)
bins_x = np.arange(0, 0.6, 0.01)
bins_y = np.arange(1, 2500, 200)
plt.hist2d(data = df, x = 'BorrowerAPR', y = 'MonthlyLoanPayment',
bins = [bins_x, bins_y], cmap = 'viridis_r')
plt.colorbar();
Comment: The monthly payment seems to have a limit at appr. 1000$ this is where below most of the rates are concentrated a slight trend can be seen that lower rates are drawn by higher monthly payments. Also notice the peak between 0.3 and 0.4 and 200!
df.LoanOriginalAmount.describe()
count 77108.000000 mean 9300.966683 std 6398.995358 min 1000.000000 25% 4000.000000 50% 8000.000000 75% 14584.500000 max 35000.000000 Name: LoanOriginalAmount, dtype: float64
# APR vs. original loan amount: scatter (left) and heat map (right)
plt.figure(figsize = [12, 5])
x = np.arange(0.1, 0.6, 0.01)
# left plot: scatterplot of discrete data with jitter and transparency
plt.subplot(1, 2, 1)
sb.regplot(data = df, x = 'BorrowerAPR', y = 'LoanOriginalAmount', fit_reg = False,
x_jitter = 0.0, y_jitter = 0.0, scatter_kws = {'alpha' : 1/3})
# right plot: heat map with bin edges between values
plt.subplot(1, 2, 2)
bins_x = np.arange(0.1, 0.6, 0.01)
bins_y = np.arange(1000, 35000, 2000)
plt.hist2d(data = df, x = 'BorrowerAPR', y = 'LoanOriginalAmount',
bins = [bins_x, bins_y], cmap = 'viridis_r')
plt.colorbar();
Comment: The higher the loan, the more the spread of rates decreases. Please note the peak between 0.3 and 0.4 for amounts up to 5000! What else can influence the rate? Yes — the estimated loss and return for the lender.
# APR vs. estimated loss: scatter (left) and heat map (right)
plt.figure(figsize = [12, 5])
x = np.arange(0.1, 0.6, 0.01)
# left plot: scatterplot of discrete data with jitter and transparency
plt.subplot(1, 2, 1)
sb.regplot(data = df, x = 'BorrowerAPR', y = 'EstimatedLoss', fit_reg = False,
x_jitter = 0.0, y_jitter = 0.00, scatter_kws = {'alpha' : 1/3})
# right plot: heat map with bin edges between values
plt.subplot(1, 2, 2)
bins_x = np.arange(0.1, 0.6, 0.01)
bins_y = np.arange(0, 0.40, 0.01)
plt.hist2d(data = df, x = 'BorrowerAPR', y = 'EstimatedLoss',
bins = [bins_x, bins_y], cmap = 'viridis_r')
plt.colorbar();
Comment: Very nice correlation...let's see a fitted line
sb.regplot(data = pairplot_samp, x = 'BorrowerAPR', y = 'EstimatedLoss', color="g", scatter_kws = {'alpha' : 1/2});
Comment: The smaller the loss the smaller the APR. We can speak here about a strong negative relationship....so what about the estimated Return?
# APR vs. estimated return: scatter (left) and heat map (right)
plt.figure(figsize = [12, 5])
x = np.arange(0.1, 0.6, 0.01)
# left plot: scatterplot of discrete data with jitter and transparency
plt.subplot(1, 2, 1)
sb.regplot(data = df, x = 'BorrowerAPR', y = 'EstimatedReturn', fit_reg = False,
x_jitter = 0.0, y_jitter = 0.00, scatter_kws = {'alpha' : 1/3})
# right plot: heat map with bin edges between values
plt.subplot(1, 2, 2)
bins_x = np.arange(0.1, 0.6, 0.01)
bins_y = np.arange(0, 0.40, 0.01)
plt.hist2d(data = df, x = 'BorrowerAPR', y = 'EstimatedReturn',
bins = [bins_x, bins_y], cmap = 'viridis_r')
plt.colorbar();
Comment: Surprisingly, the Return also has a negative relationship to the APR. Why is that? It's the way it is calculated: the estimated return is the difference between the estimated yield and the estimated loss.
Summary Who is using Prosper:
In general the occupation group "Other" dominates the listings, whereas other occupations have a small proportion. This is a pity, as it might be due to the fact that users can simply pick the "Other" category from a drop-down list.
Summary: Why is Prosper used?
What is influencing the Rate?
BorrowerAPR vs. ProsperRating: The distributions are nicely climbing with decreasing the rating. Most of the distribution are bi- or multimodal. The spread in each category is as well large. On categories AA we find many outliers to the right. In A as well and C, D have outliers as well in both directions. E has outliers to the left. Again below as an example the density and viloin plot for "E"! HR has a relatively small IQR and many outliers to the left.BorrowerAPRvs. DebtToIncomeRatio: As we know from the beginning the correlation coeff = 0,128, so we have a weak linear relation. Below 2 the rates are going the full range we need to zoom in a bit. One thing which is looking strange is the nearly vertical line between 0,35 and appr. 0,37 along all D2IR.The majority of the values is concentrated between 0,1 and 0,4 and 0,10 to 0,25. We can see a slight upward trend in the concentrated area, but the rate also must be influenced by something else.BorrowerAPR vs Term: We know from the Univariate Analysis that 12 month term are very seldom bit they have a lower rates as a starting point of their distributions. So 36 seems to be multimodal and 60 right skewed. Still all 3 terms give overall a wide range of rates. In 36 we see the 3rd modality which is looking similar to the concentration we saw in the DTIR between 0,35 and 0,37 let's look closer. The Medians are close to each other, the range for 60 is smaller. So this can might explain the peak line we saw in the D2IR I was mentioning before, however either it is a combination of attributes or it is another attribute which is primarily deriving the rate.ProsperScore: We can see that the medians of the rates are increasing by decreasing the score. However 2 things are here interesting. BorrowerRate vs. IncomeVerifiable: The appr. 10% not having a verified income do get higher rates. So having all documents ready helps as well here.BorrowerAPR vs. IsBorrowerHomeowner: To own a house definity helps. 
We can clearly see that the median rate is lower for houseowners. BorrowerAPR vs. StatedMonthyIncome: We can see on the log scale that the most frequent area is 1900 and above 20000 (I mean it's monthly income). However the range of APR is still going from 5 - above 40%. Having a lot of money/month is not an indication to get better rates. We see that the fit isn't really good but it emphasizes a bit the less income the higher the rate. In the heatmap we see a that the range goes from 0,1 to 0,4 through all income levels.BorrowerAPR vs. IncomeRange: This makes it really visible the platform Prosper is used by well earning lenders, the spike between 0.3 and 0.4. Having a higher income definitely helps to get the best rates, however there is much variance in the distributions. The proportion of below 25k and even unemployed is very small. BorrowerAPR vs. MonthlyPayment: The monthly payment seems to have a limit at appr. 1000$ this is where below most of the rates are concentrated a slight trend can be seen that lower rates are drawn by higher monthly payments.BorrowerAPR vs. LoanAmount: The higher the loan the spread of rates decreases.BorrowerAPR vs. EstimatedLos: The smaller the loss the smaller the APR. We can speak here about a strong negative relationship....so what about the estimated Return?Estimated Return: Surprisingly the Return is as well in negative relationship to the APR. Why is that? It's the way how it is calculated. The estimated yield which is the difference between the estimated yield and the loss.Yes the Income Range was underlining the relationship of high incomes to better rates.
Let's put together some demographic attributes and some attributes related to financials.
# top_n definitions: how many categories to keep per attribute
no_occ = 10
no_states = 10
no_li = 10
# Top-n occupations by frequency; slice starts at 1 to skip the category at
# rank 0 (per the earlier analysis 'Other' dominates the listings).
occ_topn = df.Occupation.value_counts()[1:no_occ+1].index
occ_topn = occ_topn.astype('str')
def set_occ(occupation):
    """Return the occupation unchanged if it is a top-n one, else 'Other'."""
    return occupation if occupation in occ_topn else 'Other'
# Drop a stale column from a previous run so the cell is idempotent.
# BUGFIX/idiom: replaces a blanket try/except-pass with the targeted
# errors='ignore', which only suppresses the missing-column case.
df.drop(['Occupation_Condensed'], axis=1, inplace=True, errors='ignore')
## create the condensed attribute
df['Occupation_Condensed'] = df['Occupation'].apply(set_occ)
# category order: 'Other' first, then the top-n occupations
sequence_occ = occ_topn
sequence_occ = sequence_occ.insert(0, 'Other')
df['Occupation_Condensed'] = pd.Categorical(df['Occupation_Condensed'], categories=sequence_occ, ordered=True)
# create a new field which either keeps the top-n states or moves the
# value to 'Other'
states_topn = df.BorrowerState.value_counts()[0:no_states].index
states_topn = states_topn.astype('str')
def set_state(state):
    """Return the state unchanged if it is a top-n one, else 'Other'."""
    return state if state in states_topn else 'Other'
# Drop a stale column from a previous run so the cell is idempotent.
# BUGFIX/idiom: replaces a blanket try/except-pass with the targeted
# errors='ignore', which only suppresses the missing-column case.
df.drop(['StateCondensed'], axis=1, inplace=True, errors='ignore')
## create the condensed attribute
df['StateCondensed'] = df['BorrowerState'].apply(set_state)
# category order: top-n states first, 'Other' appended at the end
sequence_state = states_topn
sequence_state = sequence_state.insert(len(states_topn)+1, 'Other')
df['StateCondensed'] = pd.Categorical(df['StateCondensed'], categories=sequence_state, ordered=True)
# draw a fresh ~10% row sample for the scatter panels below
row_ids = np.random.choice(df.shape[0], 7700, replace=False)
mv = df.iloc[row_ids, :]
# condense the listing category to its top-n values
li_topn = df.ListingCategory_alpha.value_counts()[0:no_li].index
li_topn = li_topn.astype('str')
def set_li(li):
    # NOTE(review): unlike set_occ/set_state this returns None (not 'Other')
    # for non-top categories; since 'Other' is also absent from the category
    # list below, the end result is NaN either way -- confirm this is intended
    if li in li_topn:
        return(li)
    else:
        pass
# drop a stale column from a previous run, ignoring a missing column
try:
    df.drop(['Listing_Cat_Condensed'], axis=1, inplace=True)
except Exception as e:
    pass
## create the condensed attribute
df['Listing_Cat_Condensed'] = df['ListingCategory_alpha'].apply(set_li)
# category order is just the top-n listing categories
sequence_li = li_topn
#sequence_li= sequence_li.insert(len(states_topn)+1, 'Other')
df['Listing_Cat_Condensed'] = pd.Categorical(df['Listing_Cat_Condensed'], categories=sequence_li, ordered=True)
# common denominator for share calculations
c_denominator = len(df)
# qualitative palette shared by all four panels
name = "tab20c"
cmap = plt.get_cmap(name)
colors = cmap.colors # type: list
# create the main 2x2 dashboard figure
fig, ax = plt.subplots(2,2,figsize=(25,15))
# BUGFIX: corrected title typo 'Inormation' -> 'Information'
plt.suptitle('Prosper Demographic / Loan Information', fontdict={'fontsize': 18}, color='b')
plt.subplots_adjust(hspace = 0.3)
###### upper left #############################################
# stacked bars: top-10 states on the x axis, stacked by condensed occupation
ul = ax[0,0]
# state order for the x axis
cat1_order = sequence_state[:10]
# occupation order for the stack layers
#cat2_order = np.flip(sequence_occ)
cat2_order = sequence_occ
# HomeOwner
cat3_order = sequence_state[:10]
# running top of each bar while stacking
baselines = np.zeros(len(cat1_order))
# build the stacked plot one occupation layer at a time
for i in range(len(cat2_order)):
    # isolate the counts of the first category,
    cat2 = cat2_order[i]
    inner_counts = df[df['Occupation_Condensed'] == cat2]['StateCondensed'].value_counts(normalize=False)
    # then plot those counts on top of the accumulated baseline
    ul.bar(x = np.arange(len(cat1_order)), height = inner_counts[cat1_order],
    bottom = baselines , color=colors[i])
    #removing top and right borders
    ul.spines['top'].set_visible(False)
    ul.spines['right'].set_visible(False)
    ul.grid(False)
    baselines += inner_counts[cat1_order]
total_counts_num = df['StateCondensed'].value_counts(normalize=False)
total_counts_perc = df['StateCondensed'].value_counts(normalize=True)
# text annotations: percentage share above each bar
for i in range(len(cat1_order)):
    # NOTE(review): total_counts_perc[i] indexes by value_counts order while
    # the bar position uses cat1_order[i] -- confirm both orders agree,
    # otherwise the percentages land on the wrong bars
    y = str(format(total_counts_perc[i]*100 ,'.2f')) + '%'
    ul.text(i-0.3, total_counts_num[cat1_order[i]]+200, y, va = 'center', color='black', fontdict={'fontsize': 12});
baselines = np.zeros(len(cat1_order))
ul.set_xticks(np.arange(len(cat1_order)))
ul.set_xticklabels(cat1_order);
ul.legend(cat2_order);
# share of listings covered by the named (non-'Other') states, for the title
topn_total_perc = str(format((1 - total_counts_perc['Other']) * 100, '.2f')) + '%'
ul.set_title('Prosper Top 10 US States / Top 10 Occupations (' + topn_total_perc +')', fontdict={'fontsize': 15}, color='b');
ul.set_xlabel('Prosper States', fontdict={'fontsize': 12}, color='b');
ul.set_ylabel('Count States / Occupations', fontdict={'fontsize': 12}, color='b');
###### upper right #############################################
# stacked bars: occupations on the x axis, stacked by condensed listing category
ur = ax[0,1]
# occupation order for the x axis
cat1_order = sequence_occ[:10]
# listing-category order for the stack layers
cat2_order = sequence_li
# running top of each bar while stacking
baselines = np.zeros(len(cat1_order))
# build the stacked plot one listing-category layer at a time
for i in range(len(cat2_order)):
    # isolate the counts of the first category,
    cat2 = cat2_order[i]
    inner_counts = df[df['Listing_Cat_Condensed'] == cat2]['Occupation_Condensed'].value_counts(normalize=False)
    # then plot those counts on top of the accumulated baseline
    ur.bar(x = np.arange(len(cat1_order)), height = inner_counts[cat1_order],
    bottom = baselines , color=colors[i])
    #removing top and right borders
    ur.spines['top'].set_visible(False)
    ur.spines['right'].set_visible(False)
    ur.grid(False)
    baselines += inner_counts[cat1_order]
total_counts_num = df['Occupation_Condensed'].value_counts(normalize=False)
total_counts_perc = df['Occupation_Condensed'].value_counts(normalize=True)
# text annotations: percentage share above each bar
for i in range(len(cat1_order)):
    # NOTE(review): total_counts_perc[i] indexes by value_counts order while
    # the bar position uses cat1_order[i] -- confirm both orders agree
    y = str(format(total_counts_perc[i]*100 ,'.2f')) + '%'
    ur.text(i-0.3, total_counts_num[cat1_order[i]]+200, y, va = 'center', color='black', fontdict={'fontsize': 12});
baselines = np.zeros(len(cat1_order))
ur.set_xticks(np.arange(len(cat1_order)))
ur.set_xticklabels(cat1_order, rotation = 20, ha='right');
ur.legend(cat2_order);
# share covered by the named (non-'Other') occupations, for the title
topn_total_perc = str(format((1 - total_counts_perc['Other']) * 100, '.2f')) + '%'
ur.set_title('Prosper Top 10 Occupations / Top 10 Loan Types (' + topn_total_perc +')', fontdict={'fontsize': 15}, color='b');
ur.set_xlabel('Prosper Occupations', fontdict={'fontsize': 12}, color='b');
ur.set_ylabel('Count Occupations / Loan Types', fontdict={'fontsize': 12}, color='b');
##################################################################
###### Lower left ################################################
# bubble chart on the sampled frame: loan amount vs. income,
# point size encodes the monthly loan payment
lf = ax[1,0]
lf.scatter(data = mv, x = 'StatedMonthlyIncome', y = 'LoanOriginalAmount', s = 'MonthlyLoanPayment', alpha = .1, color='lightblue')
lf.set_xlim(100,20000)
# dummy scatter series used only to build a point-size legend
sizes = [100,500, 1000 ]
base_color = sb.color_palette()[0]
legend_obj = []
# make legend
for s in sizes:
    legend_obj.append(lf.scatter([], [], s = s, alpha = .5 , color = 'steelblue')) #color = base_color
# BUGFIX: corrected legend title typo 'Monthy' -> 'Monthly'
lf.legend(legend_obj, sizes, title='Monthly Loan Payment ($)', loc='upper left',
          labelspacing=2, borderpad=2);
lf.spines['top'].set_visible(False)
lf.spines['right'].set_visible(False)
lf.grid(False)
lf.set_title('Loan Amount vs. Monthly Income ($) based on a 10% Sample', fontdict={'fontsize': 15}, color='b');
lf.set_xlabel('Monthly Income ($)', fontdict={'fontsize': 12}, color='b');
lf.set_ylabel('Loan Amount ($)', fontdict={'fontsize': 12}, color='b');
##################################################################
###### Lower right ###############################################
# grouped bars: per-occupation means of loan amount, income and payment
lr = ax[1,1]
#mv_m = mv.groupby('Occupation')['LoanOriginalAmount', 'StatedMonthlyIncome', 'MonthlyLoanPayment'].mean().reset_index()
#mv_m = mv_m[df_m['Occupation'].isin(sequence_occ)]
#mv_m['Occupation'] = df_m['Occupation'].astype('str')
# means are computed on the full frame, not the sample
mv_m = df.groupby('Occupation_Condensed')['LoanOriginalAmount', 'StatedMonthlyIncome', 'MonthlyLoanPayment'].mean().reset_index()
#mv_m = mv_m[mv_m['Occupation_Condensed'].isin(sequence_occ)]
mv_m['Occupation_Condensed'] = mv_m['Occupation_Condensed'].astype('str')
mv_m.sort_values('LoanOriginalAmount', ascending=False, inplace=True)
# bring to tidy (long) format so seaborn can hue by variable
tidy = mv_m.melt(id_vars='Occupation_Condensed').rename(columns=str.title)
# NOTE(review): barplot draws on the current axes (no ax= given) -- confirm
# it actually targets lr at this point in the figure construction
sb.barplot(x='Occupation_Condensed', y='Value', hue='Variable', data=tidy, palette = colors)
lr.set_xticklabels(mv_m['Occupation_Condensed'], rotation = 20, ha='right');
lr.legend(title='Loan/Income ($)');
lr.spines['top'].set_visible(False)
lr.spines['right'].set_visible(False)
lr.grid(False)
lr.set_title('Mean Loan, Monthly Income and Monthly Loan Payment Amount ($)', fontdict={'fontsize': 15}, color='b');
lr.set_xlabel('Prosper Occupations', fontdict={'fontsize': 12}, color='b');
lr.set_ylabel('Loan/Income Amount ($)', fontdict={'fontsize': 12}, color='b');
Comment: Very interesting to see the demographical aspects together.
# 90-row rolling average of Time2Money
# NOTE(review): the rolling window runs over the frame's current row order,
# not explicitly over time -- confirm df is sorted by date upstream
df['Time2Money_rolling']= df['Time2Money'].rolling(window=90).mean()
# fill the NAs (first window-1 rows) with the overall mean
df['Time2Money_rolling'].fillna(df['Time2Money'].mean(), inplace = True)
# weekly means of raw and rolling Time2Money
week_time2money_avg = df.groupby([pd.Grouper(key='ListingCreationDate', freq='W-MON')])['Time2Money', 'Time2Money_rolling'].mean().reset_index().sort_values('Time2Money')
plt.figure(figsize = [12, 5]) # larger figure size for subplots
ax = sb.lineplot(week_time2money_avg.ListingCreationDate, week_time2money_avg.Time2Money, label='Time2Money')
# BUGFIX: corrected label typos 'Rollong' -> 'Rolling' and 'Losting' -> 'Listing'
ax = sb.lineplot(week_time2money_avg.ListingCreationDate, week_time2money_avg.Time2Money_rolling, label='Time2Money / Rolling Avg. 90 days')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.grid(False)
ax.set_title('Time 2 Money / Time 2 Money 90day Moving Average \n', fontdict={'fontsize': 15}, color='b');
ax.set_xlabel('Listing Creation Date', fontdict={'fontsize': 12}, color='b');
ax.set_ylabel('Time to Money (days)', fontdict={'fontsize': 12}, color='b');
print(week_time2money_avg.Time2Money_rolling.mean())
11.15765999157341
Comment: So the 90-day moving average indicates a nearly constant level of approx. 11 days. Is there a change by level of debt and rating?
# re-sample ~10% so the scatter renders clearly and quickly
# ordered categorical Term for later use
df['Term_Alpha'] = pd.Categorical(df.Term, categories=[12,36,60], ordered=True)
samples = np.random.choice(df.shape[0], 7700, replace = False)
mv = df.iloc[samples,:]
# Time2Money vs. debt-to-income, colored by Prosper rating
g = sb.FacetGrid(data = mv, hue = 'ProsperRating_Alpha', size = 6, despine=True, palette='tab20c')
g.map(plt.scatter, 'Time2Money', 'DebtToIncomeRatio',alpha = .5 )
g.set(ylim=(0, 5))
g.add_legend();
g.axes[0,0].grid(False)
g.axes[0,0].set_title(' Time to Money vs. Debt to Income Ratio (indebtedness) \n', fontdict={'fontsize': 15}, color='b');
g.axes[0,0].set_xlabel('Time to Money (days)', fontdict={'fontsize': 12}, color='b');
g.axes[0,0].set_ylabel('Debt to Income Ratio', fontdict={'fontsize': 12}, color='b');
Ok, pretty busy chart, let's zoom in
# same scatter, zoomed to Time2Money <= 30 and DTIR <= 2
g = sb.FacetGrid(data = mv, hue = 'ProsperRating_Alpha', size = 10, palette='tab20c')
g.map(plt.scatter, 'Time2Money', 'DebtToIncomeRatio', alpha = .5)
g.set(xlim=(0.0, 30))
#g.set(ylim=(0, 50))
g.set(ylim=(0, 2))
g.add_legend();
g.axes[0,0].grid(False)
g.axes[0,0].set_title(' Time to Money vs. Debt to Income Ratio (indebtedness) \n Zoom: T2M <=30 / D2ICR <= 2',
fontdict={'fontsize': 15}, color='b');
g.axes[0,0].set_xlabel('Time to Money (days)', fontdict={'fontsize': 12}, color='b');
g.axes[0,0].set_ylabel('Debt to Income Ratio', fontdict={'fontsize': 12}, color='b');
Comment: There is a wide range of days between the listing creation and the payout of the loan. There is nothing spectacular here, maybe we check in combination with the loan amount by occupation...
## helper functions for the facet grid below
def vertical_mean_line(x, **kwargs):
    """Draw a dashed green vertical line at the mean of x."""
    plt.axvline(x.mean(), color='green', linestyle='dashed', linewidth=2)

def horizontal_mean_line(x, **kwargs):
    """Draw a dashed orange horizontal line at the mean of x."""
    plt.axhline(x.mean(), color='orange', linestyle='dashed', linewidth=3)

def annotations(x, y, **kwargs):
    """Annotate a facet with its average Time2Money and loan amount."""
    plt.text(x.mean() + 3, 38000,
             'Avg. T2M: ' + '{:0.0f} days'.format(x.mean()),
             color='green', fontdict={'fontsize': 12});
    plt.text(x.mean() + 15, y.mean() + 3000,
             'Avg. Loan Amount: \n' + '{:0.0f}$'.format(y.mean()),
             color='orange', fontdict={'fontsize': 12, 'weight': 'heavy'});
# main plot: Time2Money vs. LoanOriginalAmount, one facet per condensed occupation
g = sb.FacetGrid(data = mv, col = 'Occupation_Condensed', col_wrap = 4, size = 5, sharex=True, sharey=True, despine=True)
g.map(plt.scatter, 'Time2Money','LoanOriginalAmount', alpha = .4, color='silver');
# zoom to the dense region
g.set(xlim=(0,40))
g.set(ylim=(0,40000))
# per-facet mean markers and text annotations (helpers defined above)
g.map(vertical_mean_line,'Time2Money' );
g.map(horizontal_mean_line, 'LoanOriginalAmount');
g.map(annotations, 'Time2Money', 'LoanOriginalAmount');
g.set_titles('{col_name}');
# with col_wrap the axes array is 1-D, hence the single-index loop
for i in np.arange(g.axes.shape[0]):
    g.axes[i].grid(False)
# label only the leftmost column (0, 4, 8) and the bottom row (7-10)
g.axes[0].set_ylabel('Original Loan Amount', fontdict={'fontsize': 12}, color='b');
g.axes[4].set_ylabel('Original Loan Amount', fontdict={'fontsize': 12}, color='b');
g.axes[8].set_ylabel('Original Loan Amount', fontdict={'fontsize': 12}, color='b');
g.axes[7].set_xlabel('Time to Money (days)', fontdict={'fontsize': 12}, color='b');
g.axes[8].set_xlabel('Time to Money (days)', fontdict={'fontsize': 12}, color='b');
g.axes[9].set_xlabel('Time to Money (days)', fontdict={'fontsize': 12}, color='b');
g.axes[10].set_xlabel('Time to Money (days)', fontdict={'fontsize': 12}, color='b');
g.fig.suptitle('Time to Money vs. Debt to Loan Amount \n Zoom: T2M <=40 / LoanAm. <= 40k', fontdict={'fontsize': 18}, color='b');
g.fig.subplots_adjust(top=.9)
# Overall mean Time to Money (days) across all loans, for comparison with the per-facet means.
df.Time2Money.mean()
11.102674171292213
# Overall mean original loan amount ($).
df.LoanOriginalAmount.mean()
9300.966683093842
Comment: The values do not differ a lot across the top 10 occupations, nor in comparison to the mean Time to Money (approx. 11 days) and the mean loan amount (approx. 9300$) over all occupations. This means the speed of the listing process does not differ significantly. Whether the Time to Money attribute is a factor for borrowers to use Prosper cannot be conclusively proven, as we would need to compare it with e.g. the processing times of traditional banks.
What about the interest rates? Let's check the rates, in particular the BorrowerAPR, which includes all fees...
# helper functions (redefined: annotations now formats BorrowerAPR as a percentage)

def vertical_mean_line(x, **kwargs):
    """Mark the mean of series x with a dashed green vertical line."""
    plt.axvline(x.mean(), color='green', linestyle='dashed', linewidth=2)

def horizontal_mean_line(x, **kwargs):
    """Mark the mean of series x with a dashed orange horizontal line."""
    plt.axhline(x.mean(), color='orange', linestyle='dashed', linewidth=3)

def annotations(x, y, **kwargs):
    """Annotate the mean APR (as a percentage) and the mean loan amount."""
    mean_apr = x.mean()
    avg_perc = '{:0.2f}%'.format(mean_apr * 100)
    plt.text(mean_apr + .02, 38000, 'Avg. APR: ' + avg_perc, color='green', fontdict={'fontsize': 11})
    avg_amount = '{:0.0f}$'.format(y.mean())
    plt.text(mean_apr + .1, y.mean() + 2000, 'Avg. Loan Amount: \n' + avg_amount, color='orange', fontdict={'fontsize': 11, 'weight': 'heavy'})
# main plot: BorrowerAPR vs. LoanOriginalAmount, one facet per condensed occupation
g = sb.FacetGrid(data = mv, col = 'Occupation_Condensed', col_wrap = 4, size = 5, sharex=False)
g.map(plt.scatter, 'BorrowerAPR','LoanOriginalAmount', alpha = .4, color='silver');
# per-facet mean markers and annotations (helpers defined above)
g.map(vertical_mean_line, 'BorrowerAPR');
g.map(horizontal_mean_line, 'LoanOriginalAmount');
g.map(annotations, 'BorrowerAPR', 'LoanOriginalAmount');
# zoom: APR <= 50%, amount <= 40k
g.set(xlim=(0,.5))
g.set(ylim=(0,40000))
g.set_titles('{col_name}');
# with col_wrap the axes array is 1-D, hence the single-index loop
for i in np.arange(g.axes.shape[0]):
    g.axes[i].grid(False)
# label only the leftmost column (0, 4, 8) and the bottom row (7-10)
g.axes[0].set_ylabel('Original Loan Amount', fontdict={'fontsize': 12}, color='b');
g.axes[4].set_ylabel('Original Loan Amount', fontdict={'fontsize': 12}, color='b');
g.axes[8].set_ylabel('Original Loan Amount', fontdict={'fontsize': 12}, color='b');
g.axes[7].set_xlabel('Borrower APR (%)', fontdict={'fontsize': 12}, color='b');
g.axes[8].set_xlabel('Borrower APR (%)', fontdict={'fontsize': 12}, color='b');
g.axes[9].set_xlabel('Borrower APR (%)', fontdict={'fontsize': 12}, color='b');
g.axes[10].set_xlabel('Borrower APR (%)', fontdict={'fontsize': 12}, color='b');
#g.axes[2].set_title('LoanMount vs. APR by Occupation (Top10) \n', fontsize=18, color="b");
# special annotations: red arrows pointing at the ~20k loan cluster in the first facet
# signature reminder: arrow(x, y, dx, dy, **kwargs)
g.axes[0].arrow(.1, 35000, .1, -9000, length_includes_head=True,head_width=.02, head_length=800, fc='r', ec='r');
g.axes[0].arrow(.05, 30000, .1, -9000, length_includes_head=True,head_width=.02, head_length=800, fc='r', ec='r');
g.fig.suptitle('BorrowerAPR / Loan Amount by Occupation (Top10) \n Zoom: PAR <=0.5 (50%) / LoanAm. <= 40k', fontdict={'fontsize': 18}, color='b');
g.fig.subplots_adjust(top=.9);
Comment: Loan Amount vs. BorrowerAPR by occupation is quite interesting. The mean APR and mean loan amount are fairly close to each other across groups; however, Computer Programmers get the best rates on average (19.10%), while Administrative Assistants get the worst rates (25.59%). Interesting is the fact that larger loan amounts, e.g. 20k, seem to have primarily rates better than the average of the respective occupation group. See above the red arrow annotations in the "other" group.
However, whether the rates are good enough to attract the borrowers would need to be analyzed with other data, e.g. data on bank loans for similar purposes and terms. To me the rates seem rather high, especially by today's standards (and compared to Germany).
This leads us to the question what influences most the BorrowerAPR?
The majority of borrowers are employed, which makes sense because it wouldn't be easy to get a loan without a job. To dig further, I will investigate in the next part what their occupations are.
# Occupation sized by frequency
# Wordcloud of the raw Occupation strings, shaped by a USA map mask
# NOTE(review): str.cat will fail/misbehave if Occupation contains NaN — confirm the
# column was cleaned earlier in the notebook.
text = df.loc[:, 'Occupation'].str.cat(others=None, sep=' ')
# Read the mask image (word cloud is drawn only inside the non-white mask area)
usa_mask = np.array(Image.open('usa.jpg'))
# Instantiate word cloud object
wc = WordCloud(background_color='white', max_words=200, mask=usa_mask, stopwords=STOPWORDS,
               width=640, height=480,contour_width=.01,
               contour_color='green')
# Generate word cloud from the concatenated occupation text
wc.generate(text)
# Show word cloud
plt.figure(figsize = [12, 8])
plt.imshow(wc, interpolation='bilinear')
plt.title('Occupation sized by frequency', fontsize=20)
plt.axis('off');
Comment: Most borrowers on Prosper indicate being a Professional, Computer Programmer, Administrative Assistant, Executive, Teacher, Analyst... All these people have chosen to borrow on Prosper instead of going the conventional way and borrowing from their commercial bank. This could be due to an attractive interest rate offered to these categories of people. That is what we are going to see later.
# Create a new column which returns 1 for a good payment status
# ('Current', 'Completed', 'FinalPaymentInProgress') and 0 if there is an
# issue (defaulted, past due, charged-off payments).
status_good=['Current','Completed','FinalPaymentInProgress']

def cat_Loanstatus(row):
    """Return 1 if any 'good' status label occurs in the status string, else 0."""
    return 1 if any(good in row for good in status_good) else 0
# Classify every loan as good (1) / delinquent (0) using the helper above.
df["loanstatus_cat"] = df.LoanStatus.apply(lambda x: cat_Loanstatus(x))
# Midpoint of the reported credit-score range (lower/upper bounds given per loan).
df['CreditScore_average'] = (df['CreditScoreRangeLower'] + df['CreditScoreRangeUpper'])/2
# Mean credit score and mean borrower rate per status group.
status = df.groupby('loanstatus_cat').agg({'CreditScore_average':'mean','BorrowerRate':'mean'})
status = status.reset_index()
# Replace the numeric codes with readable labels for plotting.
status["loanstatus_cat"].replace({0: "Delinquent", 1: "Good"}, inplace=True)
# NOTE(review): head(10) is a no-op here — the grouped frame has only two rows.
status = status.sort_values(['CreditScore_average'],ascending=False).head(10)
status
| loanstatus_cat | CreditScore_average | BorrowerRate | |
|---|---|---|---|
| 1 | Good | 709.739930 | 0.187714 |
| 0 | Delinquent | 695.633036 | 0.250538 |
# Create a figure for a dual-axis bar chart (credit score vs. borrower rate per status)
fig = plt.figure(figsize=(8,8))
# Create matplotlib axes for the credit-score bars
ax = fig.add_subplot(111)
# Create a second y-axis sharing the same x-axis for the rate bars
ax2 = ax.twinx()
width = 0.20
# position=0 / position=1 shifts the two bar groups side by side instead of overlapping
CreditScore_average = status.plot(x='loanstatus_cat', y='CreditScore_average', kind='bar', color = 'purple',
                                  width=width,ax=ax, position = 0, label = 'Average CreditScore')
BorrowerRate = status.plot(x='loanstatus_cat', y='BorrowerRate', kind='bar', color = 'gray',
                           width=width,ax=ax2, position = 1, label = 'Borrower Rate')
ax.set_ylabel('Credit Score average')
CreditScore_average.legend(loc='upper right')
ax2.set_ylabel('Borrower Rate')
ax2.legend(loc='upper left')
ax.set_xlabel('Loan status categories')
plt.title('Borrower Rate and Average Credit Score by Delinquency Status');
Comment: It's clear now that for people who are delinquent (defaulted, past due, charged-off payments), lenders apply more restrictive credit conditions (higher interest rates). These people also have lower credit scores than people with a good status.
# Convert 'ListingCreationDate' type from str to datetime
df['ListingCreationDate']=pd.to_datetime(df['ListingCreationDate'])
# Extract the date part from the ListingCreationDate column
df['date'] = df['ListingCreationDate'].dt.date
# Extract the year from the ListingCreationDate column
df['year']= df['ListingCreationDate'].dt.year
# Convert 'date' (currently datetime.date objects) back to pandas datetime
df['date']=pd.to_datetime(df['date'])
# df['year']=pd.to_datetime(df['year'])
# Line plot: mean borrower rate per year, split by homeowner status
# (ci=None disables the bootstrap confidence band for faster rendering)
plt.figure(figsize=(10,10))
g=sb.lineplot(data=df, x='year', y='BorrowerRate', hue='IsBorrowerHomeowner',ci=None);
Comment: As we can see, the two rates have had the same evolution profile. As expected, rates for individuals with collateral (homeowners) are lower than for those without collateral, but the gap between the two rates has decreased significantly since 2011.
color_base=sb.color_palette()[0]
# Relabel the binary status column on the full frame for the legend.
# NOTE(review): this mutates df in place — rerunning earlier cells that expect
# the numeric 0/1 codes will break after this point.
df["loanstatus_cat"].replace({0: "Delinquent", 1: "Good"}, inplace=True)
plt.figure(figsize=[8, 5])
# Boxplot: APR per employment status, split by good/delinquent loans
sb.boxplot(data=df,x='EmploymentStatus',y='BorrowerAPR',hue='loanstatus_cat',dodge=0.3, color=color_base);
plt.xticks(rotation=45);
# Move the legend outside the axes to the right
plt.legend(loc = 6, bbox_to_anchor = (1.0, 0.5)) ;
Comment: This figure confirms everything that has been said already: the applied borrower rate is higher for unemployed people and for delinquent loans (loans with bad records).
Let's review some rating, income and loan attributes....
# Violin plots: each metric (columns) against each categorical attribute (rows)
g = sb.PairGrid(data = mv, x_vars = ['LoanOriginalAmount', 'DebtToIncomeRatio','BorrowerAPR', 'EstimatedLoss', ],
                y_vars = ['ProsperRating_Alpha','IncomeRange', 'IncomeVerifiable', 'IsBorrowerHomeowner', 'Term_Alpha'], size=3, aspect=1.5, dropna=True)
g.map(sb.violinplot, color='skyblue', saturation=3);
# Tighten the x-ranges of the first two columns to the dense regions
g.axes[0,0].set_xlim(-1000,20000)
g.axes[0,1].set_xlim(-0.2,1)
g.fig.suptitle('ProsperRating/IncomeRange vs. LoanMount, APR and Estimated Loss', fontdict={'fontsize': 18}, color='b');
g.fig.subplots_adjust(top=.9)
# PairGrid axes are 2-D (rows x cols), hence the nested loop
for i in np.arange(g.axes.shape[0]):
    for j in np.arange(g.axes.shape[1]):
        g.axes[i,j].grid(False)
Comment:
# Metric variables for the pairwise comparison, colored by Prosper rating
variables = ["BorrowerAPR", "DebtToIncomeRatio", "EstimatedLoss"]
g = sb.PairGrid(data = mv, vars = variables, size = 5, hue="ProsperRating_Alpha", palette ='tab20c')
# Diagonal: per-rating histograms; off-diagonal: scatter plots
g = g.map_diag(plt.hist, bins = 40, alpha = .5);
g.map_offdiag(plt.scatter, alpha = .5);
# PairGrid axes are 2-D (rows x cols), hence the nested loop
for i in np.arange(g.axes.shape[0]):
    for j in np.arange(g.axes.shape[1]):
        g.axes[i,j].grid(False)
# let's improve some axis limits
g.axes[0,1].set_xlim(0,2)
g.add_legend();
g.fig.suptitle('APR, DebtToIncomeRatio, EstimatedLoss by ProsperRating ', fontdict={'fontsize': 18}, color='b');
g.fig.subplots_adjust(top=.9);
Comment: We can see nicely that the Prosper Rating is a powerful evaluation metric. We can clearly see how the APR distribution changes across the ratings from AA to HR. Furthermore, we can see that the rates increase with higher indebtedness (at least slightly) and with a decreasing Prosper Rating. The same holds for the estimated loss: it gets slightly higher with increasing indebtedness, but much more so through the assignment to the respective Prosper Rating.
# Convert this report to HTML.
# Fix: the previous invocation call(['python', '-m', 'nbconvert', ...]) returned
# 4294967295 (-1 as an unsigned 32-bit int), i.e. the conversion FAILED.
# nbconvert is driven through the `jupyter nbconvert` CLI; pass --to html
# explicitly since an HTML export is the stated goal.
from subprocess import call
call(['jupyter', 'nbconvert', '--to', 'html', 'exploration_template.ipynb'])
4294967295